import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
df_final = pd.read_excel(r"C:\Users\evahr\Downloads\TFM-idealista\dataset_final.xlsx")
df_final.shape
(4418, 34)
def count_plot(col, target, data):
counts = (joined_df[[target, col]]
.groupby([target, col])
.size()
.unstack(target)
)
group_counts = counts.sum(axis='columns')
print(group_counts.sort_values(ascending=False).head(20))
print()
def column_plot(col, target, data, ax=None):
counts = (joined_df[[target, col]]
.groupby([target, col])
.size()
.unstack(target)
)
group_counts = counts.sum(axis='columns')
props = counts.div(group_counts, axis='index')
props.plot(kind="bar", stacked=False, ax=ax)
ax.legend().remove()
df_final.head()
| AÑO | MES_NUM | AÑO_MES | FECHA | PRECIO M2 | VARIACIÓN MENSUAL | VARIACIÓN TRIMESTRAL | VARIACIÓN ANUAL | DISTRITO | LATITUD | ... | ESPERANZA_VIDA | PARO_REGISTRADO | PRECIO_ALQUILER_M2 | PRECIO_VENTA_M2 | VIVIENDAS_COMPRAVENTA | EJECUCIONES_HIPOTECARIAS | IPV_VIVIENDA_NUEVA | TRANSACCIONES_SUELO | PRECIO_MEDIO_M2_CCMM | VALOR_TRANSACCIONES_SUELO | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2025 | 6 | 2025-06 | 2025-06-01 | 5729 | 0.01 | 0.073 | 0.181 | Arganzuela | 40.40021 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | 2025 | 5 | 2025-05 | 2025-05-01 | 5669 | 0.036 | 0.068 | 0.192 | Arganzuela | 40.40021 | ... | NaN | NaN | NaN | NaN | 1559.0 | NaN | NaN | NaN | NaN | NaN |
| 2 | 2025 | 4 | 2025-04 | 2025-04-01 | 5472 | 0.024 | 0.043 | 0.182 | Arganzuela | 40.40021 | ... | NaN | NaN | NaN | NaN | 1499.0 | NaN | NaN | NaN | NaN | NaN |
| 3 | 2025 | 3 | 2025-03 | 2025-03-01 | 5341 | 0.006 | 0.036 | 0.163 | Arganzuela | 40.40021 | ... | NaN | NaN | NaN | NaN | 1843.0 | 249.0 | 210.738 | 436.0 | 306.36 | 96670.26 |
| 4 | 2025 | 2 | 2025-02 | 2025-02-01 | 5310 | 0.012 | 0.049 | 0.162 | Arganzuela | 40.40021 | ... | NaN | NaN | NaN | NaN | 1786.0 | 249.0 | 210.738 | 436.0 | 306.36 | 96670.26 |
5 rows × 34 columns
df_final.columns
Index(['AÑO', 'MES_NUM', 'AÑO_MES', 'FECHA', 'PRECIO M2', 'VARIACIÓN MENSUAL',
'VARIACIÓN TRIMESTRAL', 'VARIACIÓN ANUAL', 'DISTRITO', 'LATITUD',
'LONGITUD', 'SHAPE_LENG', 'SHAPE_AREA', 'RANKING', 'VULNERABILIDAD',
'PRECIO_M2_DIST', 'EURIBOR', 'TRANSACCIONES_CM', 'INDICE_PRECIO',
'TOTAL_HIPOTECAS', 'CAPITAL_TOTAL_HIPOTECAS', 'POBLACION_ACTIVA',
'RENTA_MEDIA_HOGAR', 'POBLACION', 'ESPERANZA_VIDA', 'PARO_REGISTRADO',
'PRECIO_ALQUILER_M2', 'PRECIO_VENTA_M2', 'VIVIENDAS_COMPRAVENTA',
'EJECUCIONES_HIPOTECARIAS', 'IPV_VIVIENDA_NUEVA', 'TRANSACCIONES_SUELO',
'PRECIO_MEDIO_M2_CCMM', 'VALOR_TRANSACCIONES_SUELO'],
dtype='object')
# Contar tipos de variables
print(df_final.dtypes.value_counts())
float64 23 object 6 int64 4 datetime64[ns] 1 Name: count, dtype: int64
df_final.shape
(4418, 34)
cat_cols = df_final.select_dtypes(include='object').columns
num_cols = df_final.select_dtypes(include=['int64', 'float64']).columns
datetime_cols = df_final.select_dtypes(include=['datetime64[ns]']).columns
print("Categóricas:", cat_cols.tolist())
print("Numéricas:", num_cols.tolist())
print("Datetime:", datetime_cols.tolist())
Categóricas: ['AÑO_MES', 'VARIACIÓN MENSUAL', 'VARIACIÓN TRIMESTRAL', 'VARIACIÓN ANUAL', 'DISTRITO', 'LATITUD'] Numéricas: ['AÑO', 'MES_NUM', 'PRECIO M2', 'LONGITUD', 'SHAPE_LENG', 'SHAPE_AREA', 'RANKING', 'VULNERABILIDAD', 'PRECIO_M2_DIST', 'EURIBOR', 'TRANSACCIONES_CM', 'INDICE_PRECIO', 'TOTAL_HIPOTECAS', 'CAPITAL_TOTAL_HIPOTECAS', 'POBLACION_ACTIVA', 'RENTA_MEDIA_HOGAR', 'POBLACION', 'ESPERANZA_VIDA', 'PARO_REGISTRADO', 'PRECIO_ALQUILER_M2', 'PRECIO_VENTA_M2', 'VIVIENDAS_COMPRAVENTA', 'EJECUCIONES_HIPOTECARIAS', 'IPV_VIVIENDA_NUEVA', 'TRANSACCIONES_SUELO', 'PRECIO_MEDIO_M2_CCMM', 'VALOR_TRANSACCIONES_SUELO'] Datetime: ['FECHA']
# Convertir variables numéricas int a float
df_final[num_cols] = df_final[num_cols].astype(float)
# Contar tipos de variables
print(df_final.dtypes.value_counts())
float64 27 object 6 datetime64[ns] 1 Name: count, dtype: int64
df_final.dtypes
AÑO float64 MES_NUM float64 AÑO_MES object FECHA datetime64[ns] PRECIO M2 float64 VARIACIÓN MENSUAL object VARIACIÓN TRIMESTRAL object VARIACIÓN ANUAL object DISTRITO object LATITUD object LONGITUD float64 SHAPE_LENG float64 SHAPE_AREA float64 RANKING float64 VULNERABILIDAD float64 PRECIO_M2_DIST float64 EURIBOR float64 TRANSACCIONES_CM float64 INDICE_PRECIO float64 TOTAL_HIPOTECAS float64 CAPITAL_TOTAL_HIPOTECAS float64 POBLACION_ACTIVA float64 RENTA_MEDIA_HOGAR float64 POBLACION float64 ESPERANZA_VIDA float64 PARO_REGISTRADO float64 PRECIO_ALQUILER_M2 float64 PRECIO_VENTA_M2 float64 VIVIENDAS_COMPRAVENTA float64 EJECUCIONES_HIPOTECARIAS float64 IPV_VIVIENDA_NUEVA float64 TRANSACCIONES_SUELO float64 PRECIO_MEDIO_M2_CCMM float64 VALOR_TRANSACCIONES_SUELO float64 dtype: object
df_final.describe()
| AÑO | MES_NUM | FECHA | PRECIO M2 | LONGITUD | SHAPE_LENG | SHAPE_AREA | RANKING | VULNERABILIDAD | PRECIO_M2_DIST | ... | ESPERANZA_VIDA | PARO_REGISTRADO | PRECIO_ALQUILER_M2 | PRECIO_VENTA_M2 | VIVIENDAS_COMPRAVENTA | EJECUCIONES_HIPOTECARIAS | IPV_VIVIENDA_NUEVA | TRANSACCIONES_SUELO | PRECIO_MEDIO_M2_CCMM | VALOR_TRANSACCIONES_SUELO | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 4418.000000 | 4418.000000 | 4418 | 4418.000000 | 4418.000000 | 4418.000000 | 4.418000e+03 | 756.00000 | 756.000000 | 3789.000000 | ... | 4294.000000 | 1554.000000 | 3789.000000 | 4168.000000 | 4398.000000 | 2835.000000 | 4357.000000 | 4357.000000 | 4357.000000 | 4.357000e+03 |
| mean | 2016.214803 | 6.476460 | 2016-09-01 20:03:22.082390016 | 3227.063830 | -3.677022 | 23287.349423 | 2.888299e+07 | 11.00000 | 0.007728 | 12.633558 | ... | 84.445505 | 16298.703990 | 12.633558 | 3163.360845 | 1603.299227 | 548.244444 | 131.121532 | 379.420473 | 345.216842 | 2.210553e+05 |
| min | 2007.000000 | 1.000000 | 2007-05-01 00:00:00 | 1250.000000 | -3.745690 | 9019.544152 | 4.679185e+06 | 1.00000 | 0.005761 | 7.400000 | ... | 80.500000 | 2980.000000 | 7.400000 | 1174.000000 | 788.000000 | 116.000000 | 86.079000 | 148.000000 | 224.700000 | 2.976775e+04 |
| 25% | 2012.000000 | 3.000000 | 2012-05-01 00:00:00 | 2367.000000 | -3.702750 | 10866.335995 | 6.103168e+06 | 6.00000 | 0.006319 | 10.300000 | ... | 83.600000 | 9538.000000 | 10.300000 | 2318.000000 | 1212.000000 | 317.000000 | 107.976000 | 280.000000 | 288.860000 | 1.217268e+05 |
| 50% | 2016.000000 | 6.000000 | 2016-09-01 00:00:00 | 3027.500000 | -3.683070 | 18083.304616 | 1.404832e+07 | 11.00000 | 0.007590 | 12.100000 | ... | 84.500000 | 14946.000000 | 12.100000 | 2985.000000 | 1493.000000 | 406.000000 | 124.240000 | 361.000000 | 309.860000 | 1.805847e+05 |
| 75% | 2021.000000 | 9.000000 | 2021-02-01 00:00:00 | 3923.000000 | -3.651320 | 28142.972535 | 2.762398e+07 | 16.00000 | 0.008600 | 14.500000 | ... | 85.300000 | 20513.500000 | 14.500000 | 3871.250000 | 1864.000000 | 754.000000 | 148.812000 | 464.000000 | 377.520000 | 2.405415e+05 |
| max | 2025.000000 | 12.000000 | 2025-06-01 00:00:00 | 9818.000000 | -3.577770 | 94480.909361 | 2.378383e+08 | 21.00000 | 0.011500 | 25.480000 | ... | 89.400000 | 45756.000000 | 25.480000 | 8818.000000 | 3665.000000 | 1240.000000 | 210.738000 | 935.000000 | 641.820000 | 1.474330e+06 |
| std | 5.096514 | 3.455531 | NaN | 1198.661277 | 0.044342 | 18621.197062 | 4.936426e+07 | 6.05931 | 0.001577 | 3.087030 | ... | 1.338039 | 8570.792877 | 3.087030 | 1143.940642 | 549.876576 | 322.461326 | 30.036817 | 146.571187 | 91.462289 | 1.644689e+05 |
8 rows × 28 columns
df_final.describe(exclude=np.number)
| AÑO_MES | FECHA | VARIACIÓN MENSUAL | VARIACIÓN TRIMESTRAL | VARIACIÓN ANUAL | DISTRITO | LATITUD | |
|---|---|---|---|---|---|---|---|
| count | 4418 | 4418 | 4418.0 | 4418.000 | 4418 | 4418 | 4418 |
| unique | 218 | NaN | 121.0 | 193.000 | 427 | 21 | 21 |
| top | 2025-03 | NaN | 0.0 | 0.001 | n.d. | Carabanchel | 40.39094 |
| freq | 21 | NaN | 228.0 | 93.000 | 255 | 218 | 218 |
| mean | NaN | 2016-09-01 20:03:22.082390016 | NaN | NaN | NaN | NaN | NaN |
| min | NaN | 2007-05-01 00:00:00 | NaN | NaN | NaN | NaN | NaN |
| 25% | NaN | 2012-05-01 00:00:00 | NaN | NaN | NaN | NaN | NaN |
| 50% | NaN | 2016-09-01 00:00:00 | NaN | NaN | NaN | NaN | NaN |
| 75% | NaN | 2021-02-01 00:00:00 | NaN | NaN | NaN | NaN | NaN |
| max | NaN | 2025-06-01 00:00:00 | NaN | NaN | NaN | NaN | NaN |
df_final.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4418 entries, 0 to 4417 Data columns (total 34 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 AÑO 4418 non-null float64 1 MES_NUM 4418 non-null float64 2 AÑO_MES 4418 non-null object 3 FECHA 4418 non-null datetime64[ns] 4 PRECIO M2 4418 non-null float64 5 VARIACIÓN MENSUAL 4418 non-null object 6 VARIACIÓN TRIMESTRAL 4418 non-null object 7 VARIACIÓN ANUAL 4418 non-null object 8 DISTRITO 4418 non-null object 9 LATITUD 4418 non-null object 10 LONGITUD 4418 non-null float64 11 SHAPE_LENG 4418 non-null float64 12 SHAPE_AREA 4418 non-null float64 13 RANKING 756 non-null float64 14 VULNERABILIDAD 756 non-null float64 15 PRECIO_M2_DIST 3789 non-null float64 16 EURIBOR 4418 non-null float64 17 TRANSACCIONES_CM 4418 non-null float64 18 INDICE_PRECIO 4357 non-null float64 19 TOTAL_HIPOTECAS 4398 non-null float64 20 CAPITAL_TOTAL_HIPOTECAS 4398 non-null float64 21 POBLACION_ACTIVA 4418 non-null float64 22 RENTA_MEDIA_HOGAR 1920 non-null float64 23 POBLACION 4294 non-null float64 24 ESPERANZA_VIDA 4294 non-null float64 25 PARO_REGISTRADO 1554 non-null float64 26 PRECIO_ALQUILER_M2 3789 non-null float64 27 PRECIO_VENTA_M2 4168 non-null float64 28 VIVIENDAS_COMPRAVENTA 4398 non-null float64 29 EJECUCIONES_HIPOTECARIAS 2835 non-null float64 30 IPV_VIVIENDA_NUEVA 4357 non-null float64 31 TRANSACCIONES_SUELO 4357 non-null float64 32 PRECIO_MEDIO_M2_CCMM 4357 non-null float64 33 VALOR_TRANSACCIONES_SUELO 4357 non-null float64 dtypes: datetime64[ns](1), float64(27), object(6) memory usage: 1.1+ MB
df_final.nunique()
AÑO 19 MES_NUM 12 AÑO_MES 218 FECHA 218 PRECIO M2 2617 VARIACIÓN MENSUAL 121 VARIACIÓN TRIMESTRAL 193 VARIACIÓN ANUAL 427 DISTRITO 21 LATITUD 21 LONGITUD 21 SHAPE_LENG 21 SHAPE_AREA 21 RANKING 21 VULNERABILIDAD 59 PRECIO_M2_DIST 472 EURIBOR 199 TRANSACCIONES_CM 215 INDICE_PRECIO 72 TOTAL_HIPOTECAS 213 CAPITAL_TOTAL_HIPOTECAS 217 POBLACION_ACTIVA 73 RENTA_MEDIA_HOGAR 160 POBLACION 3697 ESPERANZA_VIDA 66 PARO_REGISTRADO 1461 PRECIO_ALQUILER_M2 472 PRECIO_VENTA_M2 2527 VIVIENDAS_COMPRAVENTA 206 EJECUCIONES_HIPOTECARIAS 43 IPV_VIVIENDA_NUEVA 72 TRANSACCIONES_SUELO 67 PRECIO_MEDIO_M2_CCMM 71 VALOR_TRANSACCIONES_SUELO 72 dtype: int64
def resumen_nulos(df: pd.DataFrame) -> pd.DataFrame:
total = len(df)
nulos = df.isna().sum()
pct = (nulos / total * 100).round(2)
resumen = (
pd.DataFrame({"n_nulos": nulos, "%_nulos": pct})
.sort_values("%_nulos", ascending=False)
.reset_index()
.rename(columns={"index": "columna"})
)
return resumen
# Uso:
res = resumen_nulos(df_final)
print(f"Filas totales: {len(df_final):,}".replace(",", "."))
res
Filas totales: 4.418
| columna | n_nulos | %_nulos | |
|---|---|---|---|
| 0 | RANKING | 3662 | 82.89 |
| 1 | VULNERABILIDAD | 3662 | 82.89 |
| 2 | PARO_REGISTRADO | 2864 | 64.83 |
| 3 | RENTA_MEDIA_HOGAR | 2498 | 56.54 |
| 4 | EJECUCIONES_HIPOTECARIAS | 1583 | 35.83 |
| 5 | PRECIO_ALQUILER_M2 | 629 | 14.24 |
| 6 | PRECIO_M2_DIST | 629 | 14.24 |
| 7 | PRECIO_VENTA_M2 | 250 | 5.66 |
| 8 | ESPERANZA_VIDA | 124 | 2.81 |
| 9 | POBLACION | 124 | 2.81 |
| 10 | IPV_VIVIENDA_NUEVA | 61 | 1.38 |
| 11 | VALOR_TRANSACCIONES_SUELO | 61 | 1.38 |
| 12 | INDICE_PRECIO | 61 | 1.38 |
| 13 | PRECIO_MEDIO_M2_CCMM | 61 | 1.38 |
| 14 | TRANSACCIONES_SUELO | 61 | 1.38 |
| 15 | CAPITAL_TOTAL_HIPOTECAS | 20 | 0.45 |
| 16 | VIVIENDAS_COMPRAVENTA | 20 | 0.45 |
| 17 | TOTAL_HIPOTECAS | 20 | 0.45 |
| 18 | MES_NUM | 0 | 0.00 |
| 19 | AÑO | 0 | 0.00 |
| 20 | TRANSACCIONES_CM | 0 | 0.00 |
| 21 | SHAPE_AREA | 0 | 0.00 |
| 22 | LONGITUD | 0 | 0.00 |
| 23 | EURIBOR | 0 | 0.00 |
| 24 | SHAPE_LENG | 0 | 0.00 |
| 25 | LATITUD | 0 | 0.00 |
| 26 | VARIACIÓN MENSUAL | 0 | 0.00 |
| 27 | PRECIO M2 | 0 | 0.00 |
| 28 | FECHA | 0 | 0.00 |
| 29 | VARIACIÓN TRIMESTRAL | 0 | 0.00 |
| 30 | AÑO_MES | 0 | 0.00 |
| 31 | VARIACIÓN ANUAL | 0 | 0.00 |
| 32 | DISTRITO | 0 | 0.00 |
| 33 | POBLACION_ACTIVA | 0 | 0.00 |
!pip install seaborn
Requirement already satisfied: seaborn in c:\users\evahr\anaconda3\envs\master\lib\site-packages (0.13.2) Requirement already satisfied: numpy!=1.24.0,>=1.20 in c:\users\evahr\anaconda3\envs\master\lib\site-packages (from seaborn) (2.2.0) Requirement already satisfied: pandas>=1.2 in c:\users\evahr\anaconda3\envs\master\lib\site-packages (from seaborn) (2.3.1) Requirement already satisfied: matplotlib!=3.6.1,>=3.4 in c:\users\evahr\anaconda3\envs\master\lib\site-packages (from seaborn) (3.10.5) Requirement already satisfied: contourpy>=1.0.1 in c:\users\evahr\anaconda3\envs\master\lib\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (1.3.3) Requirement already satisfied: cycler>=0.10 in c:\users\evahr\anaconda3\envs\master\lib\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (0.12.1) Requirement already satisfied: fonttools>=4.22.0 in c:\users\evahr\anaconda3\envs\master\lib\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (4.59.0) Requirement already satisfied: kiwisolver>=1.3.1 in c:\users\evahr\anaconda3\envs\master\lib\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (1.4.8) Requirement already satisfied: packaging>=20.0 in c:\users\evahr\anaconda3\envs\master\lib\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (25.0) Requirement already satisfied: pillow>=8 in c:\users\evahr\anaconda3\envs\master\lib\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (11.3.0) Requirement already satisfied: pyparsing>=2.3.1 in c:\users\evahr\anaconda3\envs\master\lib\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (3.2.3) Requirement already satisfied: python-dateutil>=2.7 in c:\users\evahr\anaconda3\envs\master\lib\site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (2.9.0.post0) Requirement already satisfied: pytz>=2020.1 in c:\users\evahr\anaconda3\envs\master\lib\site-packages (from pandas>=1.2->seaborn) (2025.2) Requirement already satisfied: tzdata>=2022.7 in c:\users\evahr\anaconda3\envs\master\lib\site-packages (from pandas>=1.2->seaborn) (2025.2) Requirement already satisfied: six>=1.5 in c:\users\evahr\anaconda3\envs\master\lib\site-packages (from python-dateutil>=2.7->matplotlib!=3.6.1,>=3.4->seaborn) (1.17.0)
import seaborn as sns
sns.heatmap(df_final.isnull(), cbar=False, cmap='viridis')
plt.show()
# Seleccionar solo columnas de tipo int o float
numeric_df = df_final.select_dtypes(include=['int', 'float'])
# Calcular la matriz de correlación
numeric_df.corr()
| AÑO | MES_NUM | PRECIO M2 | LONGITUD | SHAPE_LENG | SHAPE_AREA | RANKING | VULNERABILIDAD | PRECIO_M2_DIST | EURIBOR | ... | ESPERANZA_VIDA | PARO_REGISTRADO | PRECIO_ALQUILER_M2 | PRECIO_VENTA_M2 | VIVIENDAS_COMPRAVENTA | EJECUCIONES_HIPOTECARIAS | IPV_VIVIENDA_NUEVA | TRANSACCIONES_SUELO | PRECIO_MEDIO_M2_CCMM | VALOR_TRANSACCIONES_SUELO | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| AÑO | 1.000000e+00 | -7.352193e-02 | 0.206790 | 2.986621e-02 | -6.185436e-04 | -3.064848e-03 | 4.344355e-13 | -4.519205e-02 | 0.503870 | -1.356077e-01 | ... | 0.540431 | -0.087457 | 0.503870 | 0.148952 | -4.425970e-01 | -8.079882e-01 | 6.981091e-01 | 2.013491e-01 | -7.641461e-01 | -4.280697e-01 |
| MES_NUM | -7.352193e-02 | 1.000000e+00 | -0.013049 | 6.745270e-04 | 1.828170e-03 | 1.619086e-03 | 4.496300e-19 | -6.750567e-18 | 0.034524 | 1.777684e-02 | ... | -0.026185 | -0.023303 | 0.034524 | 0.031795 | -2.397014e-01 | -5.474165e-02 | -2.324892e-02 | 5.388347e-02 | 2.862622e-02 | 3.042659e-02 |
| PRECIO M2 | 2.067904e-01 | -1.304870e-02 | 1.000000 | -1.383891e-01 | -1.369996e-01 | -6.311662e-02 | 8.376857e-01 | -8.153242e-01 | 0.886857 | 2.502553e-01 | ... | 0.296574 | -0.575752 | 0.886857 | 0.997552 | 1.008855e-01 | -3.452049e-01 | 3.963150e-01 | 2.090134e-02 | 5.004371e-02 | 8.275392e-02 |
| LONGITUD | 2.986621e-02 | 6.745270e-04 | -0.138389 | 1.000000e+00 | -7.257643e-02 | -1.485527e-01 | 1.538674e-02 | -2.506334e-02 | -0.216668 | -3.031590e-02 | ... | 0.033336 | -0.312548 | -0.216668 | -0.141393 | -2.359731e-02 | 2.388537e-15 | -1.301054e-03 | -4.662823e-03 | -3.911994e-02 | -4.488555e-02 |
| SHAPE_LENG | -6.185436e-04 | 1.828170e-03 | -0.137000 | -7.257643e-02 | 1.000000e+00 | 9.647131e-01 | 8.265096e-02 | -1.382604e-01 | -0.247490 | 2.070721e-03 | ... | 0.022902 | 0.017419 | -0.247490 | -0.136032 | 4.176848e-04 | -6.905401e-16 | 4.217373e-04 | 1.067329e-03 | 7.075054e-04 | 9.462774e-05 |
| SHAPE_AREA | -3.064848e-03 | 1.619086e-03 | -0.063117 | -1.485527e-01 | 9.647131e-01 | 1.000000e+00 | 1.176461e-01 | -1.587572e-01 | -0.172145 | 3.711215e-03 | ... | 0.016382 | 0.007102 | -0.172145 | -0.060743 | 2.400787e-03 | -6.381115e-16 | 2.500641e-04 | 8.040977e-04 | 3.854647e-03 | 3.421010e-03 |
| RANKING | 4.344355e-13 | 4.496300e-19 | 0.837686 | 1.538674e-02 | 8.265096e-02 | 1.176461e-01 | 1.000000e+00 | -9.697610e-01 | 0.648452 | -3.345345e-16 | ... | 0.385358 | -0.732612 | 0.648452 | 0.834498 | -1.189590e-15 | 2.424448e-16 | -4.551009e-15 | 5.107696e-18 | 1.272004e-15 | -2.772947e-16 |
| VULNERABILIDAD | -4.519205e-02 | -6.750567e-18 | -0.815324 | -2.506334e-02 | -1.382604e-01 | -1.587572e-01 | -9.697610e-01 | 1.000000e+00 | -0.618702 | 1.841494e-02 | ... | -0.367149 | 0.763981 | -0.618702 | -0.811032 | 3.736656e-03 | 1.797143e-02 | -4.331027e-02 | 3.949819e-02 | 4.835734e-03 | 1.997080e-02 |
| PRECIO_M2_DIST | 5.038695e-01 | 3.452395e-02 | 0.886857 | -2.166678e-01 | -2.474903e-01 | -1.721447e-01 | 6.484516e-01 | -6.187017e-01 | 1.000000 | 3.298499e-01 | ... | 0.356259 | -0.382603 | 1.000000 | 0.884634 | -2.509996e-02 | -5.099449e-01 | 6.190773e-01 | 3.582100e-02 | -1.399351e-01 | 2.921590e-02 |
| EURIBOR | -1.356077e-01 | 1.777684e-02 | 0.250255 | -3.031590e-02 | 2.070721e-03 | 3.711215e-03 | -3.345345e-16 | 1.841494e-02 | 0.329850 | 1.000000e+00 | ... | -0.047809 | -0.176598 | 0.329850 | 0.228890 | 2.894284e-01 | -3.210169e-01 | 4.373884e-01 | -2.619913e-01 | 4.498442e-01 | 3.424542e-01 |
| TRANSACCIONES_CM | 5.841391e-01 | -2.308263e-01 | 0.250079 | -1.299248e-03 | 3.766439e-04 | -1.488495e-04 | -1.357196e-15 | 2.249037e-02 | 0.395406 | 3.190234e-02 | ... | 0.330311 | -0.044246 | 0.395406 | 0.217330 | 2.582011e-01 | -5.993209e-01 | 6.607787e-01 | 4.280927e-01 | -2.306137e-01 | 1.083311e-01 |
| INDICE_PRECIO | 5.356275e-01 | -7.262980e-03 | 0.415421 | -1.173685e-02 | 7.999274e-04 | 1.336666e-03 | -6.596773e-16 | -4.732883e-02 | 0.620476 | 4.862401e-01 | ... | 0.200025 | -0.128965 | 0.620476 | 0.403015 | 1.562570e-01 | -8.477761e-01 | 9.640192e-01 | 1.769544e-01 | -3.426770e-03 | 1.920096e-01 |
| TOTAL_HIPOTECAS | -2.094948e-01 | -8.742942e-02 | 0.225617 | -3.600234e-02 | 5.452705e-04 | 3.287063e-03 | -2.872359e-16 | 6.402200e-04 | 0.309151 | 4.122162e-01 | ... | -0.206041 | -0.033900 | 0.309151 | 0.241863 | 5.932853e-01 | -5.289138e-01 | 3.503098e-01 | 1.658771e-01 | 5.217309e-01 | 6.098691e-01 |
| CAPITAL_TOTAL_HIPOTECAS | -3.081577e-02 | -5.486278e-02 | 0.276199 | -3.668179e-02 | 9.661665e-04 | 3.367770e-03 | 5.102551e-16 | 1.499072e-03 | 0.444412 | 4.895612e-01 | ... | -0.121500 | -0.077989 | 0.444412 | 0.271071 | 4.627069e-01 | -6.708650e-01 | 5.079528e-01 | 2.474852e-01 | 4.010630e-01 | 5.906280e-01 |
| POBLACION_ACTIVA | 6.581625e-01 | -3.321774e-02 | 0.386976 | 1.088078e-02 | -1.763852e-04 | -7.501924e-04 | -3.078538e-15 | -3.290336e-02 | 0.552732 | 4.312246e-01 | ... | 0.363614 | -0.113635 | 0.552732 | 0.338677 | 2.095842e-02 | -6.186179e-01 | 8.889580e-01 | -2.918657e-03 | -1.552232e-01 | -1.534386e-01 |
| RENTA_MEDIA_HOGAR | 1.966314e-01 | 1.764194e-16 | 0.818114 | -5.355060e-02 | 1.450736e-01 | 1.655718e-01 | 9.565374e-01 | -9.170225e-01 | 0.624203 | 5.260507e-02 | ... | 0.471798 | -0.649157 | 0.624203 | 0.813559 | 1.064499e-01 | -1.482579e-01 | 1.958755e-01 | 4.348556e-02 | -8.476857e-02 | 3.399180e-02 |
| POBLACION | 3.392325e-01 | -2.612343e-03 | 0.032608 | -3.868073e-01 | 1.681931e-01 | 1.767426e-01 | -3.186881e-01 | 3.310839e-01 | 0.267225 | 3.875935e-01 | ... | 0.181476 | 0.468441 | 0.267225 | 0.022815 | -8.242383e-02 | -2.880216e-01 | 4.786616e-01 | -1.197168e-01 | -1.005243e-01 | -8.595348e-02 |
| ESPERANZA_VIDA | 5.404314e-01 | -2.618461e-02 | 0.296574 | 3.333616e-02 | 2.290186e-02 | 1.638198e-02 | 3.853576e-01 | -3.671492e-01 | 0.356259 | -4.780950e-02 | ... | 1.000000 | -0.398442 | 0.356259 | 0.289088 | -2.800397e-01 | -1.889344e-01 | 3.179217e-01 | 1.527561e-01 | -4.205802e-01 | -2.740467e-01 |
| PARO_REGISTRADO | -8.745679e-02 | -2.330310e-02 | -0.575752 | -3.125479e-01 | 1.741884e-02 | 7.102130e-03 | -7.326120e-01 | 7.639814e-01 | -0.382603 | -1.765983e-01 | ... | -0.398442 | 1.000000 | -0.382603 | -0.561874 | 7.829296e-02 | 6.973639e-02 | -1.085155e-01 | 4.514166e-02 | -2.049480e-02 | -9.187782e-03 |
| PRECIO_ALQUILER_M2 | 5.038695e-01 | 3.452395e-02 | 0.886857 | -2.166678e-01 | -2.474903e-01 | -1.721447e-01 | 6.484516e-01 | -6.187017e-01 | 1.000000 | 3.298499e-01 | ... | 0.356259 | -0.382603 | 1.000000 | 0.884634 | -2.509996e-02 | -5.099449e-01 | 6.190773e-01 | 3.582100e-02 | -1.399351e-01 | 2.921590e-02 |
| PRECIO_VENTA_M2 | 1.489522e-01 | 3.179538e-02 | 0.997552 | -1.413932e-01 | -1.360324e-01 | -6.074347e-02 | 8.344983e-01 | -8.110321e-01 | 0.884634 | 2.288898e-01 | ... | 0.289088 | -0.561874 | 0.884634 | 1.000000 | 9.865250e-02 | -3.376590e-01 | 3.809595e-01 | 2.289554e-02 | 7.200647e-02 | 1.076956e-01 |
| VIVIENDAS_COMPRAVENTA | -4.425970e-01 | -2.397014e-01 | 0.100885 | -2.359731e-02 | 4.176848e-04 | 2.400787e-03 | -1.189590e-15 | 3.736656e-03 | -0.025100 | 2.894284e-01 | ... | -0.280040 | 0.078293 | -0.025100 | 0.098652 | 1.000000e+00 | -1.977456e-01 | 1.907372e-02 | -3.082304e-02 | 5.676465e-01 | 3.783199e-01 |
| EJECUCIONES_HIPOTECARIAS | -8.079882e-01 | -5.474165e-02 | -0.345205 | 2.388537e-15 | -6.905401e-16 | -6.381115e-16 | 2.424448e-16 | 1.797143e-02 | -0.509945 | -3.210169e-01 | ... | -0.188934 | 0.069736 | -0.509945 | -0.337659 | -1.977456e-01 | 1.000000e+00 | -7.756671e-01 | -5.628451e-02 | 3.007214e-01 | 5.313357e-02 |
| IPV_VIVIENDA_NUEVA | 6.981091e-01 | -2.324892e-02 | 0.396315 | -1.301054e-03 | 4.217373e-04 | 2.500641e-04 | -4.551009e-15 | -4.331027e-02 | 0.619077 | 4.373884e-01 | ... | 0.317922 | -0.108516 | 0.619077 | 0.380959 | 1.907372e-02 | -7.756671e-01 | 1.000000e+00 | 1.588828e-01 | -1.827471e-01 | 3.133672e-02 |
| TRANSACCIONES_SUELO | 2.013491e-01 | 5.388347e-02 | 0.020901 | -4.662823e-03 | 1.067329e-03 | 8.040977e-04 | 5.107696e-18 | 3.949819e-02 | 0.035821 | -2.619913e-01 | ... | 0.152756 | 0.045142 | 0.035821 | 0.022896 | -3.082304e-02 | -5.628451e-02 | 1.588828e-01 | 1.000000e+00 | -1.560270e-01 | 3.427101e-01 |
| PRECIO_MEDIO_M2_CCMM | -7.641461e-01 | 2.862622e-02 | 0.050044 | -3.911994e-02 | 7.075054e-04 | 3.854647e-03 | 1.272004e-15 | 4.835734e-03 | -0.139935 | 4.498442e-01 | ... | -0.420580 | -0.020495 | -0.139935 | 0.072006 | 5.676465e-01 | 3.007214e-01 | -1.827471e-01 | -1.560270e-01 | 1.000000e+00 | 6.236637e-01 |
| VALOR_TRANSACCIONES_SUELO | -4.280697e-01 | 3.042659e-02 | 0.082754 | -4.488555e-02 | 9.462774e-05 | 3.421010e-03 | -2.772947e-16 | 1.997080e-02 | 0.029216 | 3.424542e-01 | ... | -0.274047 | -0.009188 | 0.029216 | 0.107696 | 3.783199e-01 | 5.313357e-02 | 3.133672e-02 | 3.427101e-01 | 6.236637e-01 | 1.000000e+00 |
27 rows × 27 columns
corr = df_final.select_dtypes(include=['number']).corr()
corr_3d = corr.round(3) # 3 decimales
corr_3d
| AÑO | MES_NUM | PRECIO M2 | LONGITUD | SHAPE_LENG | SHAPE_AREA | RANKING | VULNERABILIDAD | PRECIO_M2_DIST | EURIBOR | ... | ESPERANZA_VIDA | PARO_REGISTRADO | PRECIO_ALQUILER_M2 | PRECIO_VENTA_M2 | VIVIENDAS_COMPRAVENTA | EJECUCIONES_HIPOTECARIAS | IPV_VIVIENDA_NUEVA | TRANSACCIONES_SUELO | PRECIO_MEDIO_M2_CCMM | VALOR_TRANSACCIONES_SUELO | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| AÑO | 1.000 | -0.074 | 0.207 | 0.030 | -0.001 | -0.003 | 0.000 | -0.045 | 0.504 | -0.136 | ... | 0.540 | -0.087 | 0.504 | 0.149 | -0.443 | -0.808 | 0.698 | 0.201 | -0.764 | -0.428 |
| MES_NUM | -0.074 | 1.000 | -0.013 | 0.001 | 0.002 | 0.002 | 0.000 | -0.000 | 0.035 | 0.018 | ... | -0.026 | -0.023 | 0.035 | 0.032 | -0.240 | -0.055 | -0.023 | 0.054 | 0.029 | 0.030 |
| PRECIO M2 | 0.207 | -0.013 | 1.000 | -0.138 | -0.137 | -0.063 | 0.838 | -0.815 | 0.887 | 0.250 | ... | 0.297 | -0.576 | 0.887 | 0.998 | 0.101 | -0.345 | 0.396 | 0.021 | 0.050 | 0.083 |
| LONGITUD | 0.030 | 0.001 | -0.138 | 1.000 | -0.073 | -0.149 | 0.015 | -0.025 | -0.217 | -0.030 | ... | 0.033 | -0.313 | -0.217 | -0.141 | -0.024 | 0.000 | -0.001 | -0.005 | -0.039 | -0.045 |
| SHAPE_LENG | -0.001 | 0.002 | -0.137 | -0.073 | 1.000 | 0.965 | 0.083 | -0.138 | -0.247 | 0.002 | ... | 0.023 | 0.017 | -0.247 | -0.136 | 0.000 | -0.000 | 0.000 | 0.001 | 0.001 | 0.000 |
| SHAPE_AREA | -0.003 | 0.002 | -0.063 | -0.149 | 0.965 | 1.000 | 0.118 | -0.159 | -0.172 | 0.004 | ... | 0.016 | 0.007 | -0.172 | -0.061 | 0.002 | -0.000 | 0.000 | 0.001 | 0.004 | 0.003 |
| RANKING | 0.000 | 0.000 | 0.838 | 0.015 | 0.083 | 0.118 | 1.000 | -0.970 | 0.648 | -0.000 | ... | 0.385 | -0.733 | 0.648 | 0.834 | -0.000 | 0.000 | -0.000 | 0.000 | 0.000 | -0.000 |
| VULNERABILIDAD | -0.045 | -0.000 | -0.815 | -0.025 | -0.138 | -0.159 | -0.970 | 1.000 | -0.619 | 0.018 | ... | -0.367 | 0.764 | -0.619 | -0.811 | 0.004 | 0.018 | -0.043 | 0.039 | 0.005 | 0.020 |
| PRECIO_M2_DIST | 0.504 | 0.035 | 0.887 | -0.217 | -0.247 | -0.172 | 0.648 | -0.619 | 1.000 | 0.330 | ... | 0.356 | -0.383 | 1.000 | 0.885 | -0.025 | -0.510 | 0.619 | 0.036 | -0.140 | 0.029 |
| EURIBOR | -0.136 | 0.018 | 0.250 | -0.030 | 0.002 | 0.004 | -0.000 | 0.018 | 0.330 | 1.000 | ... | -0.048 | -0.177 | 0.330 | 0.229 | 0.289 | -0.321 | 0.437 | -0.262 | 0.450 | 0.342 |
| TRANSACCIONES_CM | 0.584 | -0.231 | 0.250 | -0.001 | 0.000 | -0.000 | -0.000 | 0.022 | 0.395 | 0.032 | ... | 0.330 | -0.044 | 0.395 | 0.217 | 0.258 | -0.599 | 0.661 | 0.428 | -0.231 | 0.108 |
| INDICE_PRECIO | 0.536 | -0.007 | 0.415 | -0.012 | 0.001 | 0.001 | -0.000 | -0.047 | 0.620 | 0.486 | ... | 0.200 | -0.129 | 0.620 | 0.403 | 0.156 | -0.848 | 0.964 | 0.177 | -0.003 | 0.192 |
| TOTAL_HIPOTECAS | -0.209 | -0.087 | 0.226 | -0.036 | 0.001 | 0.003 | -0.000 | 0.001 | 0.309 | 0.412 | ... | -0.206 | -0.034 | 0.309 | 0.242 | 0.593 | -0.529 | 0.350 | 0.166 | 0.522 | 0.610 |
| CAPITAL_TOTAL_HIPOTECAS | -0.031 | -0.055 | 0.276 | -0.037 | 0.001 | 0.003 | 0.000 | 0.001 | 0.444 | 0.490 | ... | -0.122 | -0.078 | 0.444 | 0.271 | 0.463 | -0.671 | 0.508 | 0.247 | 0.401 | 0.591 |
| POBLACION_ACTIVA | 0.658 | -0.033 | 0.387 | 0.011 | -0.000 | -0.001 | -0.000 | -0.033 | 0.553 | 0.431 | ... | 0.364 | -0.114 | 0.553 | 0.339 | 0.021 | -0.619 | 0.889 | -0.003 | -0.155 | -0.153 |
| RENTA_MEDIA_HOGAR | 0.197 | 0.000 | 0.818 | -0.054 | 0.145 | 0.166 | 0.957 | -0.917 | 0.624 | 0.053 | ... | 0.472 | -0.649 | 0.624 | 0.814 | 0.106 | -0.148 | 0.196 | 0.043 | -0.085 | 0.034 |
| POBLACION | 0.339 | -0.003 | 0.033 | -0.387 | 0.168 | 0.177 | -0.319 | 0.331 | 0.267 | 0.388 | ... | 0.181 | 0.468 | 0.267 | 0.023 | -0.082 | -0.288 | 0.479 | -0.120 | -0.101 | -0.086 |
| ESPERANZA_VIDA | 0.540 | -0.026 | 0.297 | 0.033 | 0.023 | 0.016 | 0.385 | -0.367 | 0.356 | -0.048 | ... | 1.000 | -0.398 | 0.356 | 0.289 | -0.280 | -0.189 | 0.318 | 0.153 | -0.421 | -0.274 |
| PARO_REGISTRADO | -0.087 | -0.023 | -0.576 | -0.313 | 0.017 | 0.007 | -0.733 | 0.764 | -0.383 | -0.177 | ... | -0.398 | 1.000 | -0.383 | -0.562 | 0.078 | 0.070 | -0.109 | 0.045 | -0.020 | -0.009 |
| PRECIO_ALQUILER_M2 | 0.504 | 0.035 | 0.887 | -0.217 | -0.247 | -0.172 | 0.648 | -0.619 | 1.000 | 0.330 | ... | 0.356 | -0.383 | 1.000 | 0.885 | -0.025 | -0.510 | 0.619 | 0.036 | -0.140 | 0.029 |
| PRECIO_VENTA_M2 | 0.149 | 0.032 | 0.998 | -0.141 | -0.136 | -0.061 | 0.834 | -0.811 | 0.885 | 0.229 | ... | 0.289 | -0.562 | 0.885 | 1.000 | 0.099 | -0.338 | 0.381 | 0.023 | 0.072 | 0.108 |
| VIVIENDAS_COMPRAVENTA | -0.443 | -0.240 | 0.101 | -0.024 | 0.000 | 0.002 | -0.000 | 0.004 | -0.025 | 0.289 | ... | -0.280 | 0.078 | -0.025 | 0.099 | 1.000 | -0.198 | 0.019 | -0.031 | 0.568 | 0.378 |
| EJECUCIONES_HIPOTECARIAS | -0.808 | -0.055 | -0.345 | 0.000 | -0.000 | -0.000 | 0.000 | 0.018 | -0.510 | -0.321 | ... | -0.189 | 0.070 | -0.510 | -0.338 | -0.198 | 1.000 | -0.776 | -0.056 | 0.301 | 0.053 |
| IPV_VIVIENDA_NUEVA | 0.698 | -0.023 | 0.396 | -0.001 | 0.000 | 0.000 | -0.000 | -0.043 | 0.619 | 0.437 | ... | 0.318 | -0.109 | 0.619 | 0.381 | 0.019 | -0.776 | 1.000 | 0.159 | -0.183 | 0.031 |
| TRANSACCIONES_SUELO | 0.201 | 0.054 | 0.021 | -0.005 | 0.001 | 0.001 | 0.000 | 0.039 | 0.036 | -0.262 | ... | 0.153 | 0.045 | 0.036 | 0.023 | -0.031 | -0.056 | 0.159 | 1.000 | -0.156 | 0.343 |
| PRECIO_MEDIO_M2_CCMM | -0.764 | 0.029 | 0.050 | -0.039 | 0.001 | 0.004 | 0.000 | 0.005 | -0.140 | 0.450 | ... | -0.421 | -0.020 | -0.140 | 0.072 | 0.568 | 0.301 | -0.183 | -0.156 | 1.000 | 0.624 |
| VALOR_TRANSACCIONES_SUELO | -0.428 | 0.030 | 0.083 | -0.045 | 0.000 | 0.003 | -0.000 | 0.020 | 0.029 | 0.342 | ... | -0.274 | -0.009 | 0.029 | 0.108 | 0.378 | 0.053 | 0.031 | 0.343 | 0.624 | 1.000 |
27 rows × 27 columns
corr.to_excel(r"C:\Users\evahr\Downloads\correlación.xlsx", index=False)
#plt.figure(figsize=(10, 6))
#sns.heatmap(df_final[num_cols].corr(), annot=True, cmap='coolwarm')
#plt.title("Matriz de correlación")
#plt.show()
num = (df_final
.select_dtypes(include="number")
.drop(columns=["DISTRITO"], errors="ignore"))
corr = num.corr()
sns.heatmap(corr, cmap="RdBu_r", vmin=-1, vmax=1, center=0)
plt.title("Matriz de correlación")
plt.show()
# Eliminar variables cols = ["FECHA","VARIACIÓN MENSUAL","VARIACIÓN TRIMESTRAL","VARIACIÓN ANUAL", "DISTRITO"] df_final1 = df_final.drop(columns=cols, errors="ignore") df_final1.info()
# Convierte a número s = df_final1['LATITUD'].astype(str).str.strip() # Convierte a número df_final1['LATITUD'] = pd.to_numeric(s, errors='coerce').astype('Float64') df_final1.info()
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Umbral de correlación peligrosa
thr = 0.80
# Ocultar mitad superior y correlaciones débiles
mask_upper = np.triu(np.ones_like(corr, dtype=bool))
mask_weak = corr.abs() < thr
mask = mask_upper | mask_weak
plt.figure(figsize=(12,10), dpi=160)
sns.heatmap(corr, mask=mask, annot=True, fmt=".2f",
vmin=-1, vmax=1, center=0, cmap="RdBu_r",
linewidths=.5, cbar_kws={"shrink": .8})
plt.xticks(rotation=45, ha="right")
plt.yticks(rotation=0)
plt.title(f"Matriz de correlación (solo |r| ≥ {thr})")
plt.tight_layout()
plt.show()
Pares de variables con multicolinealidad fuerte (|r| > 0.8).
PRECIO_VENTA_M2 (0.99)
PRECIO_ALQUILER_M2 (0.89)
PRECIO_M2_DIST (0.89)
RENTA_MEDIA_HOGAR (0.82)
RANKING ↔ RENTA_MEDIA_HOGAR (0.96) y ↔ VULNERABILIDAD (-0.97).
SHAPE_LENG ↔ SHAPE_AREA (0.96).
PRECIO_ALQUILER_M2 ↔ PRECIO_M2_DIST (1.0) Y PRECIO_VENTA_M2 (0.88)
INDICE_PRECIO ↔ IPV_VIVIENDA_NUEVA (0.96).
TOTAL_HIPOTECAS ↔ CAPITAL_TOTAL_HIPOTECAS (0.89).
POBLACION_ACTIVA ↔ IPV_VIVIENDA_NUEVA (0.89).
thr = 0.30
mask_upper = np.triu(np.ones_like(corr, bool)) # mitad superior
mask_weak = corr.abs() < thr # correlaciones débiles
mask = mask_upper | mask_weak # ocultar ambas
plt.figure(figsize=(12,10), dpi=160)
sns.heatmap(corr, mask=mask, annot=True, fmt=".2f",
vmin=-1, vmax=1, center=0, cmap="RdBu_r",
linewidths=.5, cbar_kws={"shrink": .8})
plt.xticks(rotation=45, ha="right"); plt.yticks(rotation=0)
plt.title(f"Matriz de correlación (anoto solo |r|≥{thr})")
plt.tight_layout(); plt.show()
sns.clustermap(
corr, cmap="RdBu_r", vmin=-1, vmax=1, center=0,
linewidths=.2, figsize=(11,11), dendrogram_ratio=.12,
cbar_pos=(.02, .8, .03, .18)) # posición de la barra
plt.show()
import matplotlib.pyplot as plt
import math
# Variable principal
target = "PRECIO M2"
# Seleccionar solo las columnas numéricas
numeric_cols = df_final.select_dtypes(include="number").columns
# Quitar la variable objetivo
numeric_cols = [col for col in numeric_cols if col != target]
# Definir número de filas y columnas para la rejilla
n = len(numeric_cols)
cols = 3 # número de columnas en la rejilla
rows = math.ceil(n / cols)
# Crear figura y ejes
fig, axes = plt.subplots(rows, cols, figsize=(15, 5*rows))
axes = axes.flatten()
# Crear un scatter plot en cada subplot (PRECIO M2 en eje X)
for i, col in enumerate(numeric_cols):
axes[i].scatter(df_final[target], df_final[col], alpha=0.5)
axes[i].set_xlabel(target)
axes[i].set_ylabel(col)
axes[i].set_title(f"{target} vs {col}")
# Quitar ejes vacíos si sobran
for j in range(i+1, len(axes)):
fig.delaxes(axes[j])
plt.tight_layout()
plt.show()
sns.regplot(data=df_final, x="RENTA_MEDIA_HOGAR", y="PRECIO M2",
scatter_kws={'s':12, 'alpha':0.4})
plt.tight_layout(); plt.show()
df_final.select_dtypes('number').hist(figsize=(20,20), bins=30, grid=False)
plt.tight_layout(pad=2.0, w_pad=1.5, h_pad=1.8)
plt.show()
# Contar filas duplicadas
duplicados_por_fila = df_final.duplicated().sum()
print("Valores duplicados por fila:")
print(duplicados_por_fila)
Valores duplicados por fila: 0
for col in num_cols:
plt.figure()
sns.boxplot(data=df_final[col])
plt.title(f'Outliers en {col}')
plt.show()
# --- resumen de outliers por columna numérica (regla 1.5*IQR) ---
def outlier_stats_series(s: pd.Series):
x = pd.to_numeric(s, errors='coerce').dropna()
n = len(x)
if n == 0:
return 0, 0.0, 0.0, np.nan, np.nan, 0
q1 = x.quantile(0.25)
q3 = x.quantile(0.75)
iqr = q3 - q1
if iqr == 0:
return 0, 0.0, 0.0, q1, q3, n
lower = q1 - 1.5 * iqr
upper = q3 + 1.5 * iqr
count = int(((x < lower) | (x > upper)).sum())
pct_var = (count / n) * 100
return count, pct_var, lower, upper, n
total_filas = len(df_final)
rows = []
for col in num_cols:
c, pct_var, lower, upper, n = outlier_stats_series(df_final[col])
pct_total = (c / total_filas * 100) if total_filas > 0 else 0.0
rows.append({
'variable': col,
'n_obs_no_nulas': n,
'outliers': c,
'%_sobre_variable': round(pct_var, 1),
'%_sobre_total_filas': round(pct_total, 1)
})
summary_df = (pd.DataFrame(rows)
.sort_values(['%_sobre_variable','outliers'], ascending=False)
.reset_index(drop=True))
# Muestra la tabla bonita en consola
print(summary_df.to_string(index=False))
variable n_obs_no_nulas outliers %_sobre_variable %_sobre_total_filas
VALOR_TRANSACCIONES_SUELO 4357 518 11.9 11.7
PRECIO_MEDIO_M2_CCMM 4357 466 10.7 10.5
SHAPE_LENG 4418 215 4.9 4.9
SHAPE_AREA 4418 215 4.9 4.9
POBLACION_ACTIVA 4418 187 4.2 4.2
POBLACION 4294 144 3.4 3.3
PARO_REGISTRADO 1554 45 2.9 1.0
CAPITAL_TOTAL_HIPOTECAS 4398 116 2.6 2.6
VIVIENDAS_COMPRAVENTA 4398 113 2.6 2.6
ESPERANZA_VIDA 4294 102 2.4 2.3
PRECIO M2 4418 88 2.0 2.0
TOTAL_HIPOTECAS 4398 73 1.7 1.7
PRECIO_M2_DIST 3789 57 1.5 1.3
PRECIO_ALQUILER_M2 3789 57 1.5 1.3
IPV_VIVIENDA_NUEVA 4357 63 1.4 1.4
TRANSACCIONES_SUELO 4357 63 1.4 1.4
PRECIO_VENTA_M2 4168 57 1.4 1.3
AÑO 4418 0 0.0 0.0
MES_NUM 4418 0 0.0 0.0
LONGITUD 4418 0 0.0 0.0
RANKING 756 0 0.0 0.0
VULNERABILIDAD 756 0 0.0 0.0
EURIBOR 4418 0 0.0 0.0
TRANSACCIONES_CM 4418 0 0.0 0.0
INDICE_PRECIO 4357 0 0.0 0.0
RENTA_MEDIA_HOGAR 1920 0 0.0 0.0
EJECUCIONES_HIPOTECARIAS 2835 0 0.0 0.0
df = df_final.copy()
col = 'PRECIO M2'
# --- (Opcional) filtra por año concreto ---
# df = df[df['AÑO'] == 2022]
# --- Limpieza numérica robusta para 'PRECIO M2' ---
if df[col].dtype == 'O':
df[col] = (df[col].astype(str)
.str.replace('.', '', regex=False)
.str.replace(',', '.', regex=False)
.str.extract(r'([-+]?\d*\.?\d+)', expand=False)
.astype(float))
# Limpieza básica
df = df.dropna(subset=[col])
df = df[df[col] > 0]
# --- Orden de distritos por mediana de precio ---
orden = (df.groupby('DISTRITO')[col]
.median()
.sort_values(ascending=False)
.index.tolist())
# --- Cálculo de atípicos por distrito (regla 1.5*IQR) ---
total_global = df[col].notna().sum()
rows = []
for d, g in df.groupby('DISTRITO'):
x = g[col].dropna()
n = int(x.shape[0])
if n == 0:
out = 0
else:
q1 = x.quantile(0.25)
q3 = x.quantile(0.75)
iqr = q3 - q1
if iqr == 0:
out = 0
else:
lower = q1 - 1.5 * iqr
upper = q3 + 1.5 * iqr
out = int(((x < lower) | (x > upper)).sum())
pct_d = (out / n * 100) if n > 0 else 0.0
pct_g = (out / total_global * 100) if total_global > 0 else 0.0
rows.append({
'DISTRITO': d,
'N_total': n,
'N_atipicos': out,
'%_atipicos_sobre_distrito': pct_d,
'%_atipicos_sobre_total': pct_g
})
resumen = (pd.DataFrame(rows)
.set_index('DISTRITO')
.loc[orden]
.reset_index())
# --- Boxplot ---
data = [df.loc[df['DISTRITO'] == d, col].values for d in orden]
plt.figure(figsize=(14, 8))
plt.boxplot(data, labels=orden, showfliers=True)
plt.title(f'{col} por distrito (boxplot) — con nº atípicos y % distrital')
plt.ylabel('€/m²')
plt.xticks(rotation=75, ha='right')
# --- Anotaciones: "n (pct_d%)" encima de cada caja ---
ax = plt.gca()
ymin, ymax = ax.get_ylim()
for i, d in enumerate(orden, start=1):
row = resumen.loc[resumen['DISTRITO'] == d].iloc[0]
ax.text(i, ymax, f"{int(row['N_atipicos'])} ({row['%_atipicos_sobre_distrito']:.1f}%)",
ha='center', va='bottom', fontsize=8, rotation=90)
ax.set_ylim(ymin, ymax * 1.15)
plt.tight_layout()
plt.show()
# --- Tabla resumen ordenada por % de atípicos sobre distrito ---
resumen_sorted = resumen.sort_values('%_atipicos_sobre_distrito', ascending=False)
print(resumen_sorted.to_string(index=False))
C:\Users\evahr\AppData\Local\Temp\ipykernel_13220\1305264832.py:62: MatplotlibDeprecationWarning: The 'labels' parameter of boxplot() has been renamed 'tick_labels' since Matplotlib 3.9; support for the old name will be dropped in 3.11. plt.boxplot(data, labels=orden, showfliers=True)
DISTRITO N_total N_atipicos %_atipicos_sobre_distrito %_atipicos_sobre_total
Salamanca 212 12 5.660377 0.271616
Chamartin 213 11 5.164319 0.248981
Moncloa-Aravaca 215 10 4.651163 0.226347
Tetuan 218 8 3.669725 0.181077
Retiro 208 7 3.365385 0.158443
Chamberi 210 7 3.333333 0.158443
Fuencarral-El Pardo 215 5 2.325581 0.113173
Moratalaz 190 4 2.105263 0.090539
Hortaleza 213 3 1.408451 0.067904
Ciudad Lineal 216 3 1.388889 0.067904
Centro 217 3 1.382488 0.067904
Barajas 206 2 0.970874 0.045269
Villaverde 208 2 0.961538 0.045269
Arganzuela 212 2 0.943396 0.045269
Latina 212 1 0.471698 0.022635
San Blas-Canillejas 215 0 0.000000 0.000000
Villa De Vallecas 206 0 0.000000 0.000000
Vicalvaro 185 0 0.000000 0.000000
Carabanchel 218 0 0.000000 0.000000
Usera 213 0 0.000000 0.000000
Puente De Vallecas 216 0 0.000000 0.000000
# Convierte celdas vacías y variantes de "n.d." en NaN
# Vacíos y "n.d." (con o sin puntos/espacios, mayúsculas/minúsculas)
dataset_final_pycaret = df_final.replace(
{
r'^\s*$': np.nan,
r'(?i)^(?:n\.?\s*d\.?)$': np.nan # "n.d.", "n d", "nd", "N.D.", etc.
},
regex=True
)
C:\Users\evahr\AppData\Local\Temp\ipykernel_14240\257074455.py:3: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
dataset_final_pycaret = df_final.replace(
# Convierte celdas vacías y variantes de "n.d." en NaN
# Vacíos y "n.d." (con o sin puntos/espacios, mayúsculas/minúsculas)
pd.set_option('future.no_silent_downcasting', True)
patrones = {r'^\s*$': np.nan, r'(?i)^(?:n\.?\s*d\.?)$': np.nan}
dataset_final_pycaret = df_final.replace(patrones, regex=True)
# Si quieres tipos “bonitos” (Int64/Float64/string/boolean):
dataset_final_pycaret = dataset_final_pycaret.convert_dtypes()
dataset_final_pycaret.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4418 entries, 0 to 4417 Data columns (total 34 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 AÑO 4418 non-null Int64 1 MES_NUM 4418 non-null Int64 2 AÑO_MES 4418 non-null string 3 FECHA 4418 non-null datetime64[ns] 4 PRECIO M2 4418 non-null Int64 5 VARIACIÓN MENSUAL 4380 non-null Float64 6 VARIACIÓN TRIMESTRAL 4350 non-null Float64 7 VARIACIÓN ANUAL 4163 non-null Float64 8 DISTRITO 4418 non-null string 9 LATITUD 4418 non-null string 10 LONGITUD 4418 non-null Float64 11 SHAPE_LENG 4418 non-null Float64 12 SHAPE_AREA 4418 non-null Float64 13 RANKING 756 non-null Int64 14 VULNERABILIDAD 756 non-null Float64 15 PRECIO_M2_DIST 3789 non-null Float64 16 EURIBOR 4418 non-null Float64 17 TRANSACCIONES_CM 4418 non-null Int64 18 INDICE_PRECIO 4357 non-null Float64 19 TOTAL_HIPOTECAS 4398 non-null Int64 20 CAPITAL_TOTAL_HIPOTECAS 4398 non-null Int64 21 POBLACION_ACTIVA 4418 non-null Float64 22 RENTA_MEDIA_HOGAR 1920 non-null Float64 23 POBLACION 4294 non-null Int64 24 ESPERANZA_VIDA 4294 non-null Float64 25 PARO_REGISTRADO 1554 non-null Int64 26 PRECIO_ALQUILER_M2 3789 non-null Float64 27 PRECIO_VENTA_M2 4168 non-null Int64 28 VIVIENDAS_COMPRAVENTA 4398 non-null Int64 29 EJECUCIONES_HIPOTECARIAS 2835 non-null Int64 30 IPV_VIVIENDA_NUEVA 4357 non-null Float64 31 TRANSACCIONES_SUELO 4357 non-null Int64 32 PRECIO_MEDIO_M2_CCMM 4357 non-null Float64 33 VALOR_TRANSACCIONES_SUELO 4357 non-null Float64 dtypes: Float64(17), Int64(13), datetime64[ns](1), string(3) memory usage: 1.3 MB
# Probar solo latitud - convertir a float64
s = dataset_final_pycaret['LATITUD'].astype(str).str.strip()
# Normaliza separadores y limpia basura
#s = (s.str.replace('\u00A0', '', regex=False)
# .str.replace('\u202f', '', regex=False)
# .str.replace(' ', '', regex=False)
# .str.replace(',', '.', regex=False)
# .str.replace(r'[^\d\.\-]', '', regex=True))
# Convierte a número
dataset_final_pycaret['LATITUD'] = pd.to_numeric(s, errors='coerce').astype('Float64')
dataset_final_pycaret.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4418 entries, 0 to 4417 Data columns (total 34 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 AÑO 4418 non-null Int64 1 MES_NUM 4418 non-null Int64 2 AÑO_MES 4418 non-null string 3 FECHA 4418 non-null datetime64[ns] 4 PRECIO M2 4418 non-null Int64 5 VARIACIÓN MENSUAL 4380 non-null Float64 6 VARIACIÓN TRIMESTRAL 4350 non-null Float64 7 VARIACIÓN ANUAL 4163 non-null Float64 8 DISTRITO 4418 non-null string 9 LATITUD 4418 non-null Float64 10 LONGITUD 4418 non-null Float64 11 SHAPE_LENG 4418 non-null Float64 12 SHAPE_AREA 4418 non-null Float64 13 RANKING 756 non-null Int64 14 VULNERABILIDAD 756 non-null Float64 15 PRECIO_M2_DIST 3789 non-null Float64 16 EURIBOR 4418 non-null Float64 17 TRANSACCIONES_CM 4418 non-null Int64 18 INDICE_PRECIO 4357 non-null Float64 19 TOTAL_HIPOTECAS 4398 non-null Int64 20 CAPITAL_TOTAL_HIPOTECAS 4398 non-null Int64 21 POBLACION_ACTIVA 4418 non-null Float64 22 RENTA_MEDIA_HOGAR 1920 non-null Float64 23 POBLACION 4294 non-null Int64 24 ESPERANZA_VIDA 4294 non-null Float64 25 PARO_REGISTRADO 1554 non-null Int64 26 PRECIO_ALQUILER_M2 3789 non-null Float64 27 PRECIO_VENTA_M2 4168 non-null Int64 28 VIVIENDAS_COMPRAVENTA 4398 non-null Int64 29 EJECUCIONES_HIPOTECARIAS 2835 non-null Int64 30 IPV_VIVIENDA_NUEVA 4357 non-null Float64 31 TRANSACCIONES_SUELO 4357 non-null Int64 32 PRECIO_MEDIO_M2_CCMM 4357 non-null Float64 33 VALOR_TRANSACCIONES_SUELO 4357 non-null Float64 dtypes: Float64(18), Int64(13), datetime64[ns](1), string(2) memory usage: 1.3 MB
from pandas.api.types import is_datetime64_any_dtype
dataset_final_pycaret = dataset_final_pycaret.copy()
# columnas que NO quieres convertir (fechas y categóricas puras)
EXCLUDE = ["FECHA", "AÑO_MES", "DISTRITO"]
# 1) Intentar pasar a número todo lo que no sea fecha ni excluido
cols_try = [c for c in dataset_final_pycaret.columns
if c not in EXCLUDE and not is_datetime64_any_dtype(dataset_final_pycaret[c])]
dataset_final_pycaret[cols_try] = dataset_final_pycaret[cols_try].apply(pd.to_numeric, errors="coerce")
# 2) Asegurar float en todas las numéricas
num_cols = dataset_final_pycaret.select_dtypes(include="number").columns
dataset_final_pycaret[num_cols] = dataset_final_pycaret[num_cols].astype("Float64")
dataset_final_pycaret.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4418 entries, 0 to 4417 Data columns (total 34 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 AÑO 4418 non-null Float64 1 MES_NUM 4418 non-null Float64 2 AÑO_MES 4418 non-null string 3 FECHA 4418 non-null datetime64[ns] 4 PRECIO M2 4418 non-null Float64 5 VARIACIÓN MENSUAL 4380 non-null Float64 6 VARIACIÓN TRIMESTRAL 4350 non-null Float64 7 VARIACIÓN ANUAL 4163 non-null Float64 8 DISTRITO 4418 non-null string 9 LATITUD 4418 non-null Float64 10 LONGITUD 4418 non-null Float64 11 SHAPE_LENG 4418 non-null Float64 12 SHAPE_AREA 4418 non-null Float64 13 RANKING 756 non-null Float64 14 VULNERABILIDAD 756 non-null Float64 15 PRECIO_M2_DIST 3789 non-null Float64 16 EURIBOR 4418 non-null Float64 17 TRANSACCIONES_CM 4418 non-null Float64 18 INDICE_PRECIO 4357 non-null Float64 19 TOTAL_HIPOTECAS 4398 non-null Float64 20 CAPITAL_TOTAL_HIPOTECAS 4398 non-null Float64 21 POBLACION_ACTIVA 4418 non-null Float64 22 RENTA_MEDIA_HOGAR 1920 non-null Float64 23 POBLACION 4294 non-null Float64 24 ESPERANZA_VIDA 4294 non-null Float64 25 PARO_REGISTRADO 1554 non-null Float64 26 PRECIO_ALQUILER_M2 3789 non-null Float64 27 PRECIO_VENTA_M2 4168 non-null Float64 28 VIVIENDAS_COMPRAVENTA 4398 non-null Float64 29 EJECUCIONES_HIPOTECARIAS 2835 non-null Float64 30 IPV_VIVIENDA_NUEVA 4357 non-null Float64 31 TRANSACCIONES_SUELO 4357 non-null Float64 32 PRECIO_MEDIO_M2_CCMM 4357 non-null Float64 33 VALOR_TRANSACCIONES_SUELO 4357 non-null Float64 dtypes: Float64(31), datetime64[ns](1), string(2) memory usage: 1.3 MB
Pares de variables con multicolinealidad fuerte (|r| > 0.8).
PRECIO_VENTA_M2 (0.99)
PRECIO_ALQUILER_M2 (0.89)
PRECIO_M2_DIST (0.89)
RENTA_MEDIA_HOGAR (0.82)
RANKING ↔ RENTA_MEDIA_HOGAR (0.96) y ↔ VULNERABILIDAD (-0.97).
SHAPE_LENG ↔ SHAPE_AREA (0.96).
PRECIO_ALQUILER_M2 ↔ PRECIO_M2_DIST (1.0) Y PRECIO_VENTA_M2 (0.88)
INDICE_PRECIO ↔ IPV_VIVIENDA_NUEVA (0.96).
TOTAL_HIPOTECAS ↔ CAPITAL_TOTAL_HIPOTECAS (0.89).
POBLACION_ACTIVA ↔ IPV_VIVIENDA_NUEVA (0.89).
# Eliminar variables
cols = ["AÑO_MES", "FECHA", "VARIACIÓN MENSUAL", "VARIACIÓN TRIMESTRAL", "RANKING", "VULNERABILIDAD", "PARO_REGISTRADO", "RENTA_MEDIA_HOGAR", "CAPITAL_TOTAL_HIPOTECAS", "VALOR_TRANSACCIONES_SUELO","SHAPE_LENG", "PRECIO_VENTA_M2", "PRECIO_ALQUILER_M2","PRECIO_M2_DIST", "IPV_VIVIENDA_NUEVA","EJECUCIONES_HIPOTECARIAS"]
dataset_final_pycaret = dataset_final_pycaret.drop(columns=cols, errors="ignore")
dataset_final_pycaret.head()
| AÑO | MES_NUM | PRECIO M2 | VARIACIÓN ANUAL | DISTRITO | LATITUD | LONGITUD | SHAPE_AREA | EURIBOR | TRANSACCIONES_CM | INDICE_PRECIO | TOTAL_HIPOTECAS | POBLACION_ACTIVA | POBLACION | ESPERANZA_VIDA | VIVIENDAS_COMPRAVENTA | TRANSACCIONES_SUELO | PRECIO_MEDIO_M2_CCMM | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2025.0 | 6.0 | 5729.0 | 0.181 | Arganzuela | 40.40021 | -3.69618 | 6462175.91405 | 0.02081 | 6944.0 | <NA> | <NA> | 3881.9 | <NA> | <NA> | <NA> | <NA> | <NA> |
| 1 | 2025.0 | 5.0 | 5669.0 | 0.192 | Arganzuela | 40.40021 | -3.69618 | 6462175.91405 | 0.0208 | 7020.0 | <NA> | 7598.0 | 3881.9 | <NA> | <NA> | 1559.0 | <NA> | <NA> |
| 2 | 2025.0 | 4.0 | 5472.0 | 0.182 | Arganzuela | 40.40021 | -3.69618 | 6462175.91405 | 0.02143 | 6604.0 | <NA> | 8425.0 | 3881.9 | <NA> | <NA> | 1499.0 | <NA> | <NA> |
| 3 | 2025.0 | 3.0 | 5341.0 | 0.163 | Arganzuela | 40.40021 | -3.69618 | 6462175.91405 | 0.02398 | 7380.0 | 188.43 | 7973.0 | 3868.2 | <NA> | <NA> | 1843.0 | 436.0 | 306.36 |
| 4 | 2025.0 | 2.0 | 5310.0 | 0.162 | Arganzuela | 40.40021 | -3.69618 | 6462175.91405 | 0.02402 | 6963.0 | 188.43 | 6701.0 | 3868.2 | <NA> | <NA> | 1786.0 | 436.0 | 306.36 |
# Análisis valores nulos por variable
def resumen_nulos(df: pd.DataFrame) -> pd.DataFrame:
total = len(df)
nulos = df.isna().sum()
pct = (nulos / total * 100).round(2)
resumen = (
pd.DataFrame({"n_nulos": nulos, "%_nulos": pct})
.sort_values("%_nulos", ascending=False)
.reset_index()
.rename(columns={"index": "columna"})
)
return resumen
# Uso:
res = resumen_nulos(dataset_final_pycaret)
print(f"Filas totales: {len(dataset_final_pycaret):,}".replace(",", "."))
res
Filas totales: 4.418
| columna | n_nulos | %_nulos | |
|---|---|---|---|
| 0 | VARIACIÓN ANUAL | 255 | 5.77 |
| 1 | POBLACION | 124 | 2.81 |
| 2 | ESPERANZA_VIDA | 124 | 2.81 |
| 3 | PRECIO_MEDIO_M2_CCMM | 61 | 1.38 |
| 4 | INDICE_PRECIO | 61 | 1.38 |
| 5 | TRANSACCIONES_SUELO | 61 | 1.38 |
| 6 | TOTAL_HIPOTECAS | 20 | 0.45 |
| 7 | VIVIENDAS_COMPRAVENTA | 20 | 0.45 |
| 8 | MES_NUM | 0 | 0.00 |
| 9 | AÑO | 0 | 0.00 |
| 10 | TRANSACCIONES_CM | 0 | 0.00 |
| 11 | EURIBOR | 0 | 0.00 |
| 12 | SHAPE_AREA | 0 | 0.00 |
| 13 | LONGITUD | 0 | 0.00 |
| 14 | LATITUD | 0 | 0.00 |
| 15 | DISTRITO | 0 | 0.00 |
| 16 | PRECIO M2 | 0 | 0.00 |
| 17 | POBLACION_ACTIVA | 0 | 0.00 |
cat_cols1 = dataset_final_pycaret.select_dtypes(include='object').columns
num_cols1 = dataset_final_pycaret.select_dtypes(include=['int64', 'float64']).columns
datetime_cols1 = dataset_final_pycaret.select_dtypes(include=['datetime64[ns]']).columns
print("Categóricas:", cat_cols.tolist())
print("Numéricas:", num_cols.tolist())
print("Datetime:", datetime_cols.tolist())
Categóricas: ['AÑO_MES', 'VARIACIÓN MENSUAL', 'VARIACIÓN TRIMESTRAL', 'VARIACIÓN ANUAL', 'DISTRITO', 'LATITUD'] Numéricas: ['AÑO', 'MES_NUM', 'PRECIO M2', 'VARIACIÓN MENSUAL', 'VARIACIÓN TRIMESTRAL', 'VARIACIÓN ANUAL', 'LATITUD', 'LONGITUD', 'SHAPE_LENG', 'SHAPE_AREA', 'RANKING', 'VULNERABILIDAD', 'PRECIO_M2_DIST', 'EURIBOR', 'TRANSACCIONES_CM', 'INDICE_PRECIO', 'TOTAL_HIPOTECAS', 'CAPITAL_TOTAL_HIPOTECAS', 'POBLACION_ACTIVA', 'RENTA_MEDIA_HOGAR', 'POBLACION', 'ESPERANZA_VIDA', 'PARO_REGISTRADO', 'PRECIO_ALQUILER_M2', 'PRECIO_VENTA_M2', 'VIVIENDAS_COMPRAVENTA', 'EJECUCIONES_HIPOTECARIAS', 'IPV_VIVIENDA_NUEVA', 'TRANSACCIONES_SUELO', 'PRECIO_MEDIO_M2_CCMM', 'VALOR_TRANSACCIONES_SUELO'] Datetime: ['FECHA']
# Matriz de correlación con menos variables
plt.figure(figsize=(10, 6))
sns.heatmap(dataset_final_pycaret[num_cols1].corr(), annot=True, cmap='coolwarm')
plt.title("Matriz de correlación")
plt.show()
# 1) Nos quedamos solo con numéricas y quitamos constantes
num_df = dataset_final_pycaret.select_dtypes(include="number").copy()
const_cols = num_df.columns[num_df.nunique(dropna=True) <= 1]
num_df = num_df.drop(columns=const_cols)
# 2) Correlación
corr = num_df.corr(method='pearson')
# evitar NaNs en el clustermap
np.fill_diagonal(corr.values, 1.0)
corr = corr.fillna(0)
# 3) Clustermap
sns.set(font_scale=0.85)
g = sns.clustermap(
corr, cmap="RdBu_r", vmin=-1, vmax=1, center=0,
linewidths=.2, dendrogram_ratio=.12,
cbar_pos=(.02, .8, .03, .18), figsize=(11, 11)
)
# (opcional) girar etiquetas para legibilidad
g.ax_heatmap.set_xticklabels(g.ax_heatmap.get_xticklabels(), rotation=90)
g.ax_heatmap.set_yticklabels(g.ax_heatmap.get_yticklabels(), rotation=0)
plt.show()
num = (dataset_final_pycaret
.select_dtypes(include="number")
.drop(columns=["DISTRITO"], errors="ignore"))
corr = num.corr()
sns.heatmap(corr, cmap="RdBu_r", vmin=-1, vmax=1, center=0)
plt.title("Matriz de correlación")
plt.show()
dataset_final_pycaret.head()
| AÑO | MES_NUM | PRECIO M2 | VARIACIÓN ANUAL | DISTRITO | LATITUD | LONGITUD | SHAPE_AREA | EURIBOR | TRANSACCIONES_CM | INDICE_PRECIO | TOTAL_HIPOTECAS | POBLACION_ACTIVA | POBLACION | ESPERANZA_VIDA | VIVIENDAS_COMPRAVENTA | TRANSACCIONES_SUELO | PRECIO_MEDIO_M2_CCMM | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2025.0 | 6.0 | 5729.0 | 0.181 | Arganzuela | 40.40021 | -3.69618 | 6462175.91405 | 0.02081 | 6944.0 | <NA> | <NA> | 3881.9 | <NA> | <NA> | <NA> | <NA> | <NA> |
| 1 | 2025.0 | 5.0 | 5669.0 | 0.192 | Arganzuela | 40.40021 | -3.69618 | 6462175.91405 | 0.0208 | 7020.0 | <NA> | 7598.0 | 3881.9 | <NA> | <NA> | 1559.0 | <NA> | <NA> |
| 2 | 2025.0 | 4.0 | 5472.0 | 0.182 | Arganzuela | 40.40021 | -3.69618 | 6462175.91405 | 0.02143 | 6604.0 | <NA> | 8425.0 | 3881.9 | <NA> | <NA> | 1499.0 | <NA> | <NA> |
| 3 | 2025.0 | 3.0 | 5341.0 | 0.163 | Arganzuela | 40.40021 | -3.69618 | 6462175.91405 | 0.02398 | 7380.0 | 188.43 | 7973.0 | 3868.2 | <NA> | <NA> | 1843.0 | 436.0 | 306.36 |
| 4 | 2025.0 | 2.0 | 5310.0 | 0.162 | Arganzuela | 40.40021 | -3.69618 | 6462175.91405 | 0.02402 | 6963.0 | 188.43 | 6701.0 | 3868.2 | <NA> | <NA> | 1786.0 | 436.0 | 306.36 |
print("Numéricas:", len(dataset_final_pycaret.select_dtypes(include='number').columns))
print("Categóricas:", len(dataset_final_pycaret.select_dtypes(include='object').columns))
Numéricas: 17 Categóricas: 0
# Lista de columnas en el orden que quieras
cols = list(dataset_final_pycaret.columns)
# Sacar las columnas que quieres reubicar
cols.remove("VARIACIÓN ANUAL")
cols.remove("PRECIO M2")
# Insertar primero VARIACIÓN ANUAL
euribor_index = cols.index("TRANSACCIONES_CM")
cols.insert(euribor_index, "VARIACIÓN ANUAL")
# Insertar después PRECIO M2
cols.insert(euribor_index + 1, "PRECIO M2")
# Reordenar el DataFrame
dataset_final_pycaret = dataset_final_pycaret[cols]
dataset_final_pycaret.head()
| AÑO | MES_NUM | DISTRITO | LATITUD | LONGITUD | SHAPE_AREA | EURIBOR | VARIACIÓN ANUAL | PRECIO M2 | TRANSACCIONES_CM | INDICE_PRECIO | TOTAL_HIPOTECAS | POBLACION_ACTIVA | POBLACION | ESPERANZA_VIDA | VIVIENDAS_COMPRAVENTA | TRANSACCIONES_SUELO | PRECIO_MEDIO_M2_CCMM | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2025.0 | 6.0 | Arganzuela | 40.40021 | -3.69618 | 6462175.91405 | 0.02081 | 0.181 | 5729.0 | 6944.0 | <NA> | <NA> | 3881.9 | <NA> | <NA> | <NA> | <NA> | <NA> |
| 1 | 2025.0 | 5.0 | Arganzuela | 40.40021 | -3.69618 | 6462175.91405 | 0.0208 | 0.192 | 5669.0 | 7020.0 | <NA> | 7598.0 | 3881.9 | <NA> | <NA> | 1559.0 | <NA> | <NA> |
| 2 | 2025.0 | 4.0 | Arganzuela | 40.40021 | -3.69618 | 6462175.91405 | 0.02143 | 0.182 | 5472.0 | 6604.0 | <NA> | 8425.0 | 3881.9 | <NA> | <NA> | 1499.0 | <NA> | <NA> |
| 3 | 2025.0 | 3.0 | Arganzuela | 40.40021 | -3.69618 | 6462175.91405 | 0.02398 | 0.163 | 5341.0 | 7380.0 | 188.43 | 7973.0 | 3868.2 | <NA> | <NA> | 1843.0 | 436.0 | 306.36 |
| 4 | 2025.0 | 2.0 | Arganzuela | 40.40021 | -3.69618 | 6462175.91405 | 0.02402 | 0.162 | 5310.0 | 6963.0 | 188.43 | 6701.0 | 3868.2 | <NA> | <NA> | 1786.0 | 436.0 | 306.36 |
# Crear lag (-12) para variables a partir de EURIBOR en adelante
df_lag_pycaret = dataset_final_pycaret.copy()
# ordenar por distrito y tiempo
df_lag_pycaret['AÑO'] = df_lag_pycaret['AÑO'].astype(int)
df_lag_pycaret['MES_NUM'] = df_lag_pycaret['MES_NUM'].astype(int)
df_lag_pycaret['FECHA'] = pd.to_datetime(df_lag_pycaret['AÑO'].astype(str) + '-' + df_lag_pycaret['MES_NUM'].astype(str) + '-01')
df_lag_pycaret = df_lag_pycaret.sort_values(['DISTRITO', 'FECHA'])
# columnas desde 'EURIBOR' hasta el final del DataFrame
start_idx = df_lag_pycaret.columns.get_loc('EURIBOR')
cols_desde_euribor = df_lag_pycaret.columns[start_idx:]
# nos quedamos solo con las numéricas dentro de ese tramo para laggear
cols_lag = [c for c in cols_desde_euribor if pd.api.types.is_numeric_dtype(df_lag_pycaret[c])]
# crear lag 12 por DISTRITO
for c in cols_lag:
df_lag_pycaret[f'{c}_lag12'] = df_lag_pycaret.groupby('DISTRITO')[c].shift(12)
# eliminar columna temporal de fecha
df_lag_pycaret = df_lag_pycaret.drop(columns=['FECHA'])
# df_lag = df_lag.drop(columns='FECHA') # opcional
df_lag_pycaret.head()
| AÑO | MES_NUM | DISTRITO | LATITUD | LONGITUD | SHAPE_AREA | EURIBOR | VARIACIÓN ANUAL | PRECIO M2 | TRANSACCIONES_CM | ... | PRECIO M2_lag12 | TRANSACCIONES_CM_lag12 | INDICE_PRECIO_lag12 | TOTAL_HIPOTECAS_lag12 | POBLACION_ACTIVA_lag12 | POBLACION_lag12 | ESPERANZA_VIDA_lag12 | VIVIENDAS_COMPRAVENTA_lag12 | TRANSACCIONES_SUELO_lag12 | PRECIO_MEDIO_M2_CCMM_lag12 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 211 | 2007 | 11 | Arganzuela | 40.40021 | -3.69618 | 6462175.91405 | 0.04607 | <NA> | 4000.0 | 6058.0 | ... | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> |
| 210 | 2007 | 12 | Arganzuela | 40.40021 | -3.69618 | 6462175.91405 | 0.04793 | <NA> | 4000.0 | 4577.0 | ... | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> |
| 209 | 2008 | 1 | Arganzuela | 40.40021 | -3.69618 | 6462175.91405 | 0.04498 | <NA> | 4000.0 | 6123.0 | ... | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> |
| 208 | 2008 | 2 | Arganzuela | 40.40021 | -3.69618 | 6462175.91405 | 0.04349 | <NA> | 3824.0 | 5964.0 | ... | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> |
| 207 | 2008 | 3 | Arganzuela | 40.40021 | -3.69618 | 6462175.91405 | 0.0459 | <NA> | 4000.0 | 5525.0 | ... | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> |
5 rows × 30 columns
# Eliminar columnas que no son de lag
# Lista de columnas originales que quieres eliminar
cols_drop = [
'EURIBOR',
'VARIACIÓN ANUAL',
'TRANSACCIONES_CM',
'INDICE_PRECIO',
'TOTAL_HIPOTECAS',
'POBLACION_ACTIVA',
'POBLACION',
'ESPERANZA_VIDA',
'VIVIENDAS_COMPRAVENTA',
'TRANSACCIONES_SUELO',
'PRECIO_MEDIO_M2_CCMM'
]
# Eliminarlas si existen en df_lag
df_lag_pycaret = df_lag_pycaret.drop(columns=[c for c in cols_drop if c in df_lag_pycaret.columns])
df_lag_pycaret.head()
| AÑO | MES_NUM | DISTRITO | LATITUD | LONGITUD | SHAPE_AREA | PRECIO M2 | EURIBOR_lag12 | VARIACIÓN ANUAL_lag12 | PRECIO M2_lag12 | TRANSACCIONES_CM_lag12 | INDICE_PRECIO_lag12 | TOTAL_HIPOTECAS_lag12 | POBLACION_ACTIVA_lag12 | POBLACION_lag12 | ESPERANZA_VIDA_lag12 | VIVIENDAS_COMPRAVENTA_lag12 | TRANSACCIONES_SUELO_lag12 | PRECIO_MEDIO_M2_CCMM_lag12 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 211 | 2007 | 11 | Arganzuela | 40.40021 | -3.69618 | 6462175.91405 | 4000.0 | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> |
| 210 | 2007 | 12 | Arganzuela | 40.40021 | -3.69618 | 6462175.91405 | 4000.0 | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> |
| 209 | 2008 | 1 | Arganzuela | 40.40021 | -3.69618 | 6462175.91405 | 4000.0 | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> |
| 208 | 2008 | 2 | Arganzuela | 40.40021 | -3.69618 | 6462175.91405 | 3824.0 | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> |
| 207 | 2008 | 3 | Arganzuela | 40.40021 | -3.69618 | 6462175.91405 | 4000.0 | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> |
print("Numéricas:", len(df_lag_pycaret.select_dtypes(include='number').columns))
print("Categóricas:", len(df_lag_pycaret.select_dtypes(include='object').columns))
Numéricas: 18 Categóricas: 0
from pandas.api.types import is_datetime64_any_dtype
df_lag_pycaret = df_lag_pycaret.copy()
# columnas que NO quieres convertir (fechas y categóricas puras)
EXCLUDE = ["DISTRITO"]
# 1) Intentar pasar a número todo lo que no sea fecha ni excluido
cols_try = [c for c in df_lag_pycaret.columns
if c not in EXCLUDE and not is_datetime64_any_dtype(df_lag_pycaret[c])]
df_lag_pycaret[cols_try] = df_lag_pycaret[cols_try].apply(pd.to_numeric, errors="coerce")
# 2) Asegurar float en todas las numéricas
num_cols = df_lag_pycaret.select_dtypes(include="number").columns
df_lag_pycaret[num_cols] = df_lag_pycaret[num_cols].astype("Float64")
df_lag_pycaret.info()
<class 'pandas.core.frame.DataFrame'> Index: 4418 entries, 211 to 4210 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 AÑO 4418 non-null Float64 1 MES_NUM 4418 non-null Float64 2 DISTRITO 4418 non-null string 3 LATITUD 4418 non-null Float64 4 LONGITUD 4418 non-null Float64 5 SHAPE_AREA 4418 non-null Float64 6 PRECIO M2 4418 non-null Float64 7 EURIBOR_lag12 4166 non-null Float64 8 VARIACIÓN ANUAL_lag12 3911 non-null Float64 9 PRECIO M2_lag12 4166 non-null Float64 10 TRANSACCIONES_CM_lag12 4166 non-null Float64 11 INDICE_PRECIO_lag12 4166 non-null Float64 12 TOTAL_HIPOTECAS_lag12 4166 non-null Float64 13 POBLACION_ACTIVA_lag12 4166 non-null Float64 14 POBLACION_lag12 4166 non-null Float64 15 ESPERANZA_VIDA_lag12 4166 non-null Float64 16 VIVIENDAS_COMPRAVENTA_lag12 4166 non-null Float64 17 TRANSACCIONES_SUELO_lag12 4166 non-null Float64 18 PRECIO_MEDIO_M2_CCMM_lag12 4166 non-null Float64 dtypes: Float64(18), string(1) memory usage: 768.0 KB
# Dataset sin Lag
dataset_final_pycaret.to_excel(r"C:\Users\evahr\Downloads\TFM-idealista\dataset_final_pycaret.xlsx", index=False)
# Dataset con Lag
df_lag_pycaret.to_excel(r"C:\Users\evahr\Downloads\TFM-idealista\df_lag_pycaret.xlsx", index=False)
# Convierte celdas vacías y variantes de "n.d." en NaN
# Vacíos y "n.d." (con o sin puntos/espacios, mayúsculas/minúsculas)
pd.set_option('future.no_silent_downcasting', True)
patrones = {r'^\s*$': np.nan, r'(?i)^(?:n\.?\s*d\.?)$': np.nan}
dataset_entrenamiento = df_final.replace(patrones, regex=True)
# Si quieres tipos “bonitos” (Int64/Float64/string/boolean):
dataset_entrenamiento = dataset_entrenamiento.convert_dtypes()
dataset_entrenamiento.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4418 entries, 0 to 4417 Data columns (total 34 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 AÑO 4418 non-null Int64 1 MES_NUM 4418 non-null Int64 2 AÑO_MES 4418 non-null string 3 FECHA 4418 non-null datetime64[ns] 4 PRECIO M2 4418 non-null Int64 5 VARIACIÓN MENSUAL 4380 non-null Float64 6 VARIACIÓN TRIMESTRAL 4350 non-null Float64 7 VARIACIÓN ANUAL 4163 non-null Float64 8 DISTRITO 4418 non-null string 9 LATITUD 4418 non-null string 10 LONGITUD 4418 non-null Float64 11 SHAPE_LENG 4418 non-null Float64 12 SHAPE_AREA 4418 non-null Float64 13 RANKING 756 non-null Int64 14 VULNERABILIDAD 756 non-null Float64 15 PRECIO_M2_DIST 3789 non-null Float64 16 EURIBOR 4418 non-null Float64 17 TRANSACCIONES_CM 4418 non-null Int64 18 INDICE_PRECIO 4357 non-null Float64 19 TOTAL_HIPOTECAS 4398 non-null Int64 20 CAPITAL_TOTAL_HIPOTECAS 4398 non-null Int64 21 POBLACION_ACTIVA 4418 non-null Float64 22 RENTA_MEDIA_HOGAR 1920 non-null Float64 23 POBLACION 4294 non-null Int64 24 ESPERANZA_VIDA 4294 non-null Float64 25 PARO_REGISTRADO 1554 non-null Int64 26 PRECIO_ALQUILER_M2 3789 non-null Float64 27 PRECIO_VENTA_M2 4168 non-null Int64 28 VIVIENDAS_COMPRAVENTA 4398 non-null Int64 29 EJECUCIONES_HIPOTECARIAS 2835 non-null Int64 30 IPV_VIVIENDA_NUEVA 4357 non-null Float64 31 TRANSACCIONES_SUELO 4357 non-null Int64 32 PRECIO_MEDIO_M2_CCMM 4357 non-null Float64 33 VALOR_TRANSACCIONES_SUELO 4357 non-null Float64 dtypes: Float64(17), Int64(13), datetime64[ns](1), string(3) memory usage: 1.3 MB
## Latitud a Float
s = dataset_entrenamiento['LATITUD'].astype(str).str.strip()
# Normaliza separadores y limpia basura
#s = (s.str.replace('\u00A0', '', regex=False) # espacio no separable
# .str.replace('\u202f', '', regex=False) # espacio fino
# .str.replace(' ', '', regex=False)
# .str.replace(',', '.', regex=False) # coma decimal → punto
# .str.replace(r'[^\d\.\-]', '', regex=True)) # deja solo dígitos/.-
# Convierte a número
dataset_entrenamiento['LATITUD'] = pd.to_numeric(s, errors='coerce').astype('Float64')
dataset_entrenamiento.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4418 entries, 0 to 4417 Data columns (total 34 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 AÑO 4418 non-null Int64 1 MES_NUM 4418 non-null Int64 2 AÑO_MES 4418 non-null string 3 FECHA 4418 non-null datetime64[ns] 4 PRECIO M2 4418 non-null Int64 5 VARIACIÓN MENSUAL 4380 non-null Float64 6 VARIACIÓN TRIMESTRAL 4350 non-null Float64 7 VARIACIÓN ANUAL 4163 non-null Float64 8 DISTRITO 4418 non-null string 9 LATITUD 4418 non-null Float64 10 LONGITUD 4418 non-null Float64 11 SHAPE_LENG 4418 non-null Float64 12 SHAPE_AREA 4418 non-null Float64 13 RANKING 756 non-null Int64 14 VULNERABILIDAD 756 non-null Float64 15 PRECIO_M2_DIST 3789 non-null Float64 16 EURIBOR 4418 non-null Float64 17 TRANSACCIONES_CM 4418 non-null Int64 18 INDICE_PRECIO 4357 non-null Float64 19 TOTAL_HIPOTECAS 4398 non-null Int64 20 CAPITAL_TOTAL_HIPOTECAS 4398 non-null Int64 21 POBLACION_ACTIVA 4418 non-null Float64 22 RENTA_MEDIA_HOGAR 1920 non-null Float64 23 POBLACION 4294 non-null Int64 24 ESPERANZA_VIDA 4294 non-null Float64 25 PARO_REGISTRADO 1554 non-null Int64 26 PRECIO_ALQUILER_M2 3789 non-null Float64 27 PRECIO_VENTA_M2 4168 non-null Int64 28 VIVIENDAS_COMPRAVENTA 4398 non-null Int64 29 EJECUCIONES_HIPOTECARIAS 2835 non-null Int64 30 IPV_VIVIENDA_NUEVA 4357 non-null Float64 31 TRANSACCIONES_SUELO 4357 non-null Int64 32 PRECIO_MEDIO_M2_CCMM 4357 non-null Float64 33 VALOR_TRANSACCIONES_SUELO 4357 non-null Float64 dtypes: Float64(18), Int64(13), datetime64[ns](1), string(2) memory usage: 1.3 MB
from pandas.api.types import is_datetime64_any_dtype
dataset_entrenamiento = dataset_entrenamiento.copy()
# columnas que NO quieres convertir (fechas y categóricas puras)
EXCLUDE = ["FECHA", "AÑO_MES", "DISTRITO"]
# 1) Intentar pasar a número todo lo que no sea fecha ni excluido
cols_try = [c for c in dataset_entrenamiento.columns
if c not in EXCLUDE and not is_datetime64_any_dtype(dataset_entrenamiento[c])]
dataset_entrenamiento[cols_try] = dataset_entrenamiento[cols_try].apply(pd.to_numeric, errors="coerce")
# 2) Asegurar float en todas las numéricas
num_cols = dataset_entrenamiento.select_dtypes(include="number").columns
dataset_entrenamiento[num_cols] = dataset_entrenamiento[num_cols].astype("Float64")
dataset_entrenamiento.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4418 entries, 0 to 4417 Data columns (total 34 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 AÑO 4418 non-null Float64 1 MES_NUM 4418 non-null Float64 2 AÑO_MES 4418 non-null string 3 FECHA 4418 non-null datetime64[ns] 4 PRECIO M2 4418 non-null Float64 5 VARIACIÓN MENSUAL 4380 non-null Float64 6 VARIACIÓN TRIMESTRAL 4350 non-null Float64 7 VARIACIÓN ANUAL 4163 non-null Float64 8 DISTRITO 4418 non-null string 9 LATITUD 4418 non-null Float64 10 LONGITUD 4418 non-null Float64 11 SHAPE_LENG 4418 non-null Float64 12 SHAPE_AREA 4418 non-null Float64 13 RANKING 756 non-null Float64 14 VULNERABILIDAD 756 non-null Float64 15 PRECIO_M2_DIST 3789 non-null Float64 16 EURIBOR 4418 non-null Float64 17 TRANSACCIONES_CM 4418 non-null Float64 18 INDICE_PRECIO 4357 non-null Float64 19 TOTAL_HIPOTECAS 4398 non-null Float64 20 CAPITAL_TOTAL_HIPOTECAS 4398 non-null Float64 21 POBLACION_ACTIVA 4418 non-null Float64 22 RENTA_MEDIA_HOGAR 1920 non-null Float64 23 POBLACION 4294 non-null Float64 24 ESPERANZA_VIDA 4294 non-null Float64 25 PARO_REGISTRADO 1554 non-null Float64 26 PRECIO_ALQUILER_M2 3789 non-null Float64 27 PRECIO_VENTA_M2 4168 non-null Float64 28 VIVIENDAS_COMPRAVENTA 4398 non-null Float64 29 EJECUCIONES_HIPOTECARIAS 2835 non-null Float64 30 IPV_VIVIENDA_NUEVA 4357 non-null Float64 31 TRANSACCIONES_SUELO 4357 non-null Float64 32 PRECIO_MEDIO_M2_CCMM 4357 non-null Float64 33 VALOR_TRANSACCIONES_SUELO 4357 non-null Float64 dtypes: Float64(31), datetime64[ns](1), string(2) memory usage: 1.3 MB
# Eliminar variables
cols = ["AÑO_MES", "FECHA", "VARIACIÓN MENSUAL", "VARIACIÓN TRIMESTRAL", "RANKING", "VULNERABILIDAD", "PARO_REGISTRADO", "RENTA_MEDIA_HOGAR", "CAPITAL_TOTAL_HIPOTECAS", "VALOR_TRANSACCIONES_SUELO","SHAPE_LENG", "PRECIO_VENTA_M2", "PRECIO_ALQUILER_M2","PRECIO_M2_DIST", "IPV_VIVIENDA_NUEVA","EJECUCIONES_HIPOTECARIAS"]
dataset_entrenamiento = dataset_entrenamiento.drop(columns=cols, errors="ignore")
dataset_entrenamiento.head()
| AÑO | MES_NUM | PRECIO M2 | VARIACIÓN ANUAL | DISTRITO | LATITUD | LONGITUD | SHAPE_AREA | EURIBOR | TRANSACCIONES_CM | INDICE_PRECIO | TOTAL_HIPOTECAS | POBLACION_ACTIVA | POBLACION | ESPERANZA_VIDA | VIVIENDAS_COMPRAVENTA | TRANSACCIONES_SUELO | PRECIO_MEDIO_M2_CCMM | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2025.0 | 6.0 | 5729.0 | 0.181 | Arganzuela | 40.40021 | -3.69618 | 6462175.91405 | 0.02081 | 6944.0 | <NA> | <NA> | 3881.9 | <NA> | <NA> | <NA> | <NA> | <NA> |
| 1 | 2025.0 | 5.0 | 5669.0 | 0.192 | Arganzuela | 40.40021 | -3.69618 | 6462175.91405 | 0.0208 | 7020.0 | <NA> | 7598.0 | 3881.9 | <NA> | <NA> | 1559.0 | <NA> | <NA> |
| 2 | 2025.0 | 4.0 | 5472.0 | 0.182 | Arganzuela | 40.40021 | -3.69618 | 6462175.91405 | 0.02143 | 6604.0 | <NA> | 8425.0 | 3881.9 | <NA> | <NA> | 1499.0 | <NA> | <NA> |
| 3 | 2025.0 | 3.0 | 5341.0 | 0.163 | Arganzuela | 40.40021 | -3.69618 | 6462175.91405 | 0.02398 | 7380.0 | 188.43 | 7973.0 | 3868.2 | <NA> | <NA> | 1843.0 | 436.0 | 306.36 |
| 4 | 2025.0 | 2.0 | 5310.0 | 0.162 | Arganzuela | 40.40021 | -3.69618 | 6462175.91405 | 0.02402 | 6963.0 | 188.43 | 6701.0 | 3868.2 | <NA> | <NA> | 1786.0 | 436.0 | 306.36 |
print("Numéricas:", len(dataset_entrenamiento.select_dtypes(include='number').columns))
print("Categóricas:", len(dataset_entrenamiento.select_dtypes(include='object').columns))
Numéricas: 17 Categóricas: 0
# Análisis valores nulos por variable
def resumen_nulos(df: pd.DataFrame) -> pd.DataFrame:
total = len(df)
nulos = df.isna().sum()
pct = (nulos / total * 100).round(2)
resumen = (
pd.DataFrame({"n_nulos": nulos, "%_nulos": pct})
.sort_values("%_nulos", ascending=False)
.reset_index()
.rename(columns={"index": "columna"})
)
return resumen
# Uso:
res = resumen_nulos(dataset_entrenamiento)
print(f"Filas totales: {len(dataset_entrenamiento):,}".replace(",", "."))
res
Filas totales: 4.418
| columna | n_nulos | %_nulos | |
|---|---|---|---|
| 0 | VARIACIÓN ANUAL | 255 | 5.77 |
| 1 | POBLACION | 124 | 2.81 |
| 2 | ESPERANZA_VIDA | 124 | 2.81 |
| 3 | PRECIO_MEDIO_M2_CCMM | 61 | 1.38 |
| 4 | INDICE_PRECIO | 61 | 1.38 |
| 5 | TRANSACCIONES_SUELO | 61 | 1.38 |
| 6 | TOTAL_HIPOTECAS | 20 | 0.45 |
| 7 | VIVIENDAS_COMPRAVENTA | 20 | 0.45 |
| 8 | MES_NUM | 0 | 0.00 |
| 9 | AÑO | 0 | 0.00 |
| 10 | TRANSACCIONES_CM | 0 | 0.00 |
| 11 | EURIBOR | 0 | 0.00 |
| 12 | SHAPE_AREA | 0 | 0.00 |
| 13 | LONGITUD | 0 | 0.00 |
| 14 | LATITUD | 0 | 0.00 |
| 15 | DISTRITO | 0 | 0.00 |
| 16 | PRECIO M2 | 0 | 0.00 |
| 17 | POBLACION_ACTIVA | 0 | 0.00 |
# Dataset antes de imputar por mediana
dataset_entrenamiento.to_excel(r"C:\Users\evahr\Downloads\TFM-idealista\df_previo_imputacion.xlsx", index=False)
# Imputar vulnerabilidad por media por cada distrito # Supongamos que ya tienes cargado tu dataset # dataset_entrenamiento = pd.read_excel("dataset_entrenamiento.xlsx") # Reemplazar los valores "pasados" a NaN (si vienen con otro símbolo, ej. comas) dataset_entrenamiento["VULNERABILIDAD"] = pd.to_numeric( dataset_entrenamiento["VULNERABILIDAD"], errors="coerce" ) # Imputar VULNERABILIDAD con la media por DISTRITO dataset_entrenamiento["VULNERABILIDAD"] = dataset_entrenamiento.groupby("DISTRITO")["VULNERABILIDAD"]\ .transform(lambda x: x.fillna(x.mean())) # Verificar si quedan NaN print(dataset_entrenamiento["VULNERABILIDAD"].isna().sum())
## Imputar todos los valores nulos por la mediana por cada distrito import pandas as pd # Asegurar que todas las columnas numéricas están en formato numérico num_cols = dataset_entrenamiento.select_dtypes(include="number").columns # Para cada columna numérica, imputar los NaN con la mediana del distrito for col in num_cols: dataset_entrenamiento[col] = dataset_entrenamiento.groupby("DISTRITO")[col]\ .transform(lambda x: x.fillna(x.median())) # Verificar si quedan NaN print(dataset_entrenamiento[num_cols].isna().sum())
Se imputa por valores únicos porque:
Algunos de los datos originales eran anuales, no mensuales.
Repetirlos a nivel mensual fue solo un paso de integración, no un dato real.
Así que imputar sobre filas repetidas es “dar demasiado peso” a datos inventados.
# Imputar por mediana usando valores únicos para evitar sesgo por duplicados
dataset_entrenamiento = dataset_entrenamiento.copy()
num_cols = dataset_entrenamiento.select_dtypes(include="number").columns
for col in num_cols:
# mediana por distrito calculada sobre valores únicos de esa variable
mediana_unicos = (
dataset_entrenamiento.groupby("DISTRITO")[col]
.apply(lambda s: s.drop_duplicates().median())
)
# imputar sin eliminar filas en el df original
dataset_entrenamiento[col] = dataset_entrenamiento[col].fillna(df["DISTRITO"].map(mediana_unicos))
# Análisis valores nulos por variable
def resumen_nulos(df: pd.DataFrame) -> pd.DataFrame:
total = len(df)
nulos = df.isna().sum()
pct = (nulos / total * 100).round(2)
resumen = (
pd.DataFrame({"n_nulos": nulos, "%_nulos": pct})
.sort_values("%_nulos", ascending=False)
.reset_index()
.rename(columns={"index": "columna"})
)
return resumen
# Uso:
res = resumen_nulos(dataset_entrenamiento)
print(f"Filas totales: {len(dataset_entrenamiento):,}".replace(",", "."))
res
Filas totales: 4.418
| columna | n_nulos | %_nulos | |
|---|---|---|---|
| 0 | AÑO | 0 | 0.0 |
| 1 | MES_NUM | 0 | 0.0 |
| 2 | PRECIO M2 | 0 | 0.0 |
| 3 | VARIACIÓN ANUAL | 0 | 0.0 |
| 4 | DISTRITO | 0 | 0.0 |
| 5 | LATITUD | 0 | 0.0 |
| 6 | LONGITUD | 0 | 0.0 |
| 7 | SHAPE_AREA | 0 | 0.0 |
| 8 | EURIBOR | 0 | 0.0 |
| 9 | TRANSACCIONES_CM | 0 | 0.0 |
| 10 | INDICE_PRECIO | 0 | 0.0 |
| 11 | TOTAL_HIPOTECAS | 0 | 0.0 |
| 12 | POBLACION_ACTIVA | 0 | 0.0 |
| 13 | POBLACION | 0 | 0.0 |
| 14 | ESPERANZA_VIDA | 0 | 0.0 |
| 15 | VIVIENDAS_COMPRAVENTA | 0 | 0.0 |
| 16 | TRANSACCIONES_SUELO | 0 | 0.0 |
| 17 | PRECIO_MEDIO_M2_CCMM | 0 | 0.0 |
dataset_entrenamiento.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4418 entries, 0 to 4417 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 AÑO 4418 non-null Float64 1 MES_NUM 4418 non-null Float64 2 PRECIO M2 4418 non-null Float64 3 VARIACIÓN ANUAL 4418 non-null Float64 4 DISTRITO 4418 non-null string 5 LATITUD 4418 non-null Float64 6 LONGITUD 4418 non-null Float64 7 SHAPE_AREA 4418 non-null Float64 8 EURIBOR 4418 non-null Float64 9 TRANSACCIONES_CM 4418 non-null Float64 10 INDICE_PRECIO 4418 non-null Float64 11 TOTAL_HIPOTECAS 4418 non-null Float64 12 POBLACION_ACTIVA 4418 non-null Float64 13 POBLACION 4418 non-null Float64 14 ESPERANZA_VIDA 4418 non-null Float64 15 VIVIENDAS_COMPRAVENTA 4418 non-null Float64 16 TRANSACCIONES_SUELO 4418 non-null Float64 17 PRECIO_MEDIO_M2_CCMM 4418 non-null Float64 dtypes: Float64(17), string(1) memory usage: 694.8 KB
# columnas numéricas que quieres usar
desired_cols = [
'PRECIO M2', 'VARIACIÓN ANUAL', 'LATITUD', 'LONGITUD', 'SHAPE_AREA',
'EURIBOR', 'TRANSACCIONES_CM', 'INDICE_PRECIO', 'TOTAL_HIPOTECAS',
'POBLACION_ACTIVA', 'POBLACION', 'ESPERANZA_VIDA',
'VIVIENDAS_COMPRAVENTA', 'TRANSACCIONES_SUELO', 'PRECIO_MEDIO_M2_CCMM'
]
# deja solo las que existen en el DataFrame (manteniendo el orden)
num_cols3 = [c for c in desired_cols if c in dataset_entrenamiento.columns]
print("num_cols3:", num_cols3)
num_cols3: ['PRECIO M2', 'VARIACIÓN ANUAL', 'LATITUD', 'LONGITUD', 'SHAPE_AREA', 'EURIBOR', 'TRANSACCIONES_CM', 'INDICE_PRECIO', 'TOTAL_HIPOTECAS', 'POBLACION_ACTIVA', 'POBLACION', 'ESPERANZA_VIDA', 'VIVIENDAS_COMPRAVENTA', 'TRANSACCIONES_SUELO', 'PRECIO_MEDIO_M2_CCMM']
cat_cols3 = dataset_entrenamiento.select_dtypes(include='object').columns
#num_cols3 = dataset_entrenamiento.select_dtypes(include=['int64', 'float64']).columns
datetime_cols3 = dataset_entrenamiento.select_dtypes(include=['datetime64[ns]']).columns
print("Categóricas:", cat_cols3.tolist())
#print("Numéricas:", num_cols3.tolist())
print("Datetime:", datetime_cols3.tolist())
Categóricas: [] Datetime: []
for col in num_cols3:
plt.figure()
sns.boxplot(data=dataset_entrenamiento[col])
plt.title(f'Outliers en {col}')
plt.show()
# Definir la variable objetivo
target = "PRECIO M2"
# --- Winsorización ---
for col in num_cols3:
if col != target: # Evita winsorizar la variable objetivo
# q_low = dataset_entrenamiento[col].quantile(0.01) # percentil 1
# q_high = dataset_entrenamiento[col].quantile(0.99) # percentil 99
q_low = dataset_entrenamiento[col].quantile(0.05)
q_high = dataset_entrenamiento[col].quantile(0.95)
dataset_entrenamiento[col] = np.where(dataset_entrenamiento[col] < q_low, q_low,
np.where(dataset_entrenamiento[col] > q_high, q_high, dataset_entrenamiento[col]))
# --- Gráficos para comprobar ---
for col in num_cols3:
plt.figure(figsize=(6,4))
sns.boxplot(x=dataset_entrenamiento[col])
plt.title(f'Outliers después de winsorizar: {col}')
plt.show()
## Contar número de outliers
outlier_counts = {}
for col in num_cols3:
# Solo procesar columnas numéricas y evitar el target si quieres
if pd.api.types.is_numeric_dtype(dataset_entrenamiento[col]) and col != "DISTRITO":
Q1 = dataset_entrenamiento[col].quantile(0.25)
Q3 = dataset_entrenamiento[col].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
# Contar outliers
outliers = df_final[(dataset_entrenamiento[col] < lower_bound) | (dataset_entrenamiento[col] > upper_bound)][col]
outlier_counts[col] = outliers.count()
# Convertir a DataFrame ordenado
outlier_summary = pd.DataFrame.from_dict(outlier_counts, orient="index", columns=["Nº Outliers"])
outlier_summary = outlier_summary.sort_values(by="Nº Outliers", ascending=False)
print(outlier_summary)
Nº Outliers PRECIO_MEDIO_M2_CCMM 466 PRECIO M2 88 VARIACIÓN ANUAL 0 LONGITUD 0 SHAPE_AREA 0 EURIBOR 0 LATITUD 0 TRANSACCIONES_CM 0 INDICE_PRECIO 0 POBLACION_ACTIVA 0 TOTAL_HIPOTECAS 0 POBLACION 0 ESPERANZA_VIDA 0 VIVIENDAS_COMPRAVENTA 0 TRANSACCIONES_SUELO 0
dataset_entrenamiento["PRECIO_MEDIO_M2_CCMM"] = np.log1p(dataset_entrenamiento["PRECIO_MEDIO_M2_CCMM"])
outlier_counts = {}
for col in dataset_entrenamiento.select_dtypes(include=["number"]).columns:
Q1 = dataset_entrenamiento[col].quantile(0.25)
Q3 = dataset_entrenamiento[col].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
# Contar outliers
outliers = dataset_entrenamiento[(dataset_entrenamiento[col] < lower_bound) |
(dataset_entrenamiento[col] > upper_bound)][col]
outlier_counts[col] = outliers.count()
# Resumen ordenado
outlier_summary = pd.DataFrame.from_dict(outlier_counts, orient="index", columns=["Nº Outliers"])
outlier_summary = outlier_summary.sort_values(by="Nº Outliers", ascending=False)
print(outlier_summary)
Nº Outliers PRECIO M2 88 AÑO 0 MES_NUM 0 VARIACIÓN ANUAL 0 LATITUD 0 LONGITUD 0 SHAPE_AREA 0 EURIBOR 0 TRANSACCIONES_CM 0 INDICE_PRECIO 0 TOTAL_HIPOTECAS 0 POBLACION_ACTIVA 0 POBLACION 0 ESPERANZA_VIDA 0 VIVIENDAS_COMPRAVENTA 0 TRANSACCIONES_SUELO 0 PRECIO_MEDIO_M2_CCMM 0
# Dataset después de imputar por mediana
dataset_entrenamiento.to_excel(r"C:\Users\evahr\Downloads\TFM-idealista\df_despues_imputacion.xlsx", index=False)
dataset_entrenamiento.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4418 entries, 0 to 4417 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 AÑO 4418 non-null Float64 1 MES_NUM 4418 non-null Float64 2 PRECIO M2 4418 non-null Float64 3 VARIACIÓN ANUAL 4418 non-null float64 4 DISTRITO 4418 non-null string 5 LATITUD 4418 non-null float64 6 LONGITUD 4418 non-null float64 7 SHAPE_AREA 4418 non-null float64 8 EURIBOR 4418 non-null float64 9 TRANSACCIONES_CM 4418 non-null float64 10 INDICE_PRECIO 4418 non-null float64 11 TOTAL_HIPOTECAS 4418 non-null float64 12 POBLACION_ACTIVA 4418 non-null float64 13 POBLACION 4418 non-null float64 14 ESPERANZA_VIDA 4418 non-null float64 15 VIVIENDAS_COMPRAVENTA 4418 non-null float64 16 TRANSACCIONES_SUELO 4418 non-null float64 17 PRECIO_MEDIO_M2_CCMM 4418 non-null float64 dtypes: Float64(3), float64(14), string(1) memory usage: 634.4 KB
# Lista de columnas en el orden que quieras
cols = list(dataset_entrenamiento.columns)
# Sacar las columnas que quieres reubicar
cols.remove("VARIACIÓN ANUAL")
cols.remove("PRECIO M2")
# Insertar primero VARIACIÓN ANUAL antes de 'TRANSACCIONES_CM'
euribor_index = cols.index("TRANSACCIONES_CM")
cols.insert(euribor_index, "VARIACIÓN ANUAL")
# Insertar después PRECIO M2
cols.insert(euribor_index + 1, "PRECIO M2")
# Reordenar el DataFrame
dataset_entrenamiento = dataset_entrenamiento[cols]
dataset_entrenamiento
| AÑO | MES_NUM | DISTRITO | LATITUD | LONGITUD | SHAPE_AREA | EURIBOR | VARIACIÓN ANUAL | PRECIO M2 | TRANSACCIONES_CM | INDICE_PRECIO | TOTAL_HIPOTECAS | POBLACION_ACTIVA | POBLACION | ESPERANZA_VIDA | VIVIENDAS_COMPRAVENTA | TRANSACCIONES_SUELO | PRECIO_MEDIO_M2_CCMM | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2025.0 | 6.0 | Arganzuela | 40.40021 | -3.69618 | 6.462176e+06 | 0.02081 | 0.1650 | 5729.0 | 6944.0 | 134.735 | 6964.0 | 3784.3 | 154118.0 | 84.95 | 1494.0 | 363.0 | 5.745180 |
| 1 | 2025.0 | 5.0 | Arganzuela | 40.40021 | -3.69618 | 6.462176e+06 | 0.02080 | 0.1650 | 5669.0 | 7020.0 | 134.735 | 7598.0 | 3784.3 | 154118.0 | 84.95 | 1559.0 | 363.0 | 5.745180 |
| 2 | 2025.0 | 4.0 | Arganzuela | 40.40021 | -3.69618 | 6.462176e+06 | 0.02143 | 0.1650 | 5472.0 | 6604.0 | 134.735 | 8425.0 | 3784.3 | 154118.0 | 84.95 | 1499.0 | 363.0 | 5.745180 |
| 3 | 2025.0 | 3.0 | Arganzuela | 40.40021 | -3.69618 | 6.462176e+06 | 0.02398 | 0.1630 | 5341.0 | 7380.0 | 174.540 | 7973.0 | 3784.3 | 154118.0 | 84.95 | 1843.0 | 436.0 | 5.728020 |
| 4 | 2025.0 | 2.0 | Arganzuela | 40.40021 | -3.69618 | 6.462176e+06 | 0.02402 | 0.1620 | 5310.0 | 6963.0 | 174.540 | 6701.0 | 3784.3 | 154118.0 | 84.95 | 1786.0 | 436.0 | 5.728020 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 4413 | 2008.0 | 5.0 | Villaverde | 40.37400 | -3.70000 | 2.019432e+07 | 0.04498 | 0.0075 | 2745.0 | 5107.0 | 153.000 | 12744.0 | 3425.8 | 149732.0 | 82.20 | 2237.0 | 393.0 | 6.252828 |
| 4414 | 2008.0 | 4.0 | Villaverde | 40.37400 | -3.70000 | 2.019432e+07 | 0.04498 | 0.0075 | 2806.0 | 6089.0 | 153.000 | 12744.0 | 3425.8 | 149767.0 | 82.20 | 2669.0 | 393.0 | 6.252828 |
| 4415 | 2008.0 | 3.0 | Villaverde | 40.37400 | -3.70000 | 2.019432e+07 | 0.04498 | 0.0075 | 2899.0 | 5525.0 | 152.900 | 12024.0 | 3375.6 | 150043.0 | 82.20 | 2507.0 | 458.0 | 6.271159 |
| 4416 | 2008.0 | 2.0 | Villaverde | 40.37400 | -3.70000 | 2.019432e+07 | 0.04349 | 0.0075 | 2900.0 | 5964.0 | 152.900 | 12744.0 | 3375.6 | 149306.0 | 82.20 | 2347.0 | 458.0 | 6.271159 |
| 4417 | 2008.0 | 1.0 | Villaverde | 40.37400 | -3.70000 | 2.019432e+07 | 0.04498 | 0.0075 | 2801.0 | 6123.0 | 152.900 | 12744.0 | 3375.6 | 149067.0 | 82.20 | 2278.0 | 458.0 | 6.271159 |
4418 rows × 18 columns
# ---------------------------- # PARÁMETROS # ---------------------------- start = '2025-07' # horizonte de predicción: inicio end = '2026-06' # horizonte de predicción: fin target_col = 'PRECIO M2' district_col = 'DISTRITO' year_col = 'AÑO' month_col = 'MES_NUM' # Solo lag de 12 meses LAGS = [12] # Lista de distritos (para OHE consistente) distritos = [ 'Arganzuela','Barajas','Carabanchel','Centro','Chamartin','Chamberi','Ciudad Lineal', 'Fuencarral-El Pardo','Hortaleza','Latina','Moncloa-Aravaca','Moratalaz','Puente De Vallecas', 'Retiro','Salamanca','San Blas-Canillejas','Tetuan','Usera','Vicalvaro','Villa De Vallecas', 'Villaverde' ] # --------------------------------------------------------- # 1) Copia histórico y asegura tipos/orden temporal # --------------------------------------------------------- df = dataset_entrenamiento.copy() df[year_col] = df[year_col].astype(int) df[month_col] = df[month_col].astype(int) df['FECHA'] = pd.to_datetime(df[year_col].astype(str) + '-' + df[month_col].astype(str) + '-01') df = df.sort_values([district_col, 'FECHA']).reset_index(drop=True) # --------------------------------------------------------- # 2) Columnas para lag DESDE 'EURIBOR' EN ADELANTE (numéricas) # --------------------------------------------------------- if 'EURIBOR' not in df.columns: raise KeyError("No encuentro la columna 'VARIACIÓN ANUAL' en el DataFrame.") idx_ini = df.columns.get_loc('EURIBOR') candidatas = df.columns[idx_ini:] # desde esa columna hacia la derecha num_cols = df.select_dtypes(include=[np.number]).columns cols_para_lag = [c for c in candidatas if c in num_cols] # mismas columnas y orden # --------------------------------------------------------- # 3) Crea filas FUTURAS por distrito (solo fechas) # --------------------------------------------------------- future_months = pd.period_range(start=start, end=end, freq='M').to_timestamp() future_rows = ( pd.MultiIndex.from_product([distritos, future_months], names=[district_col, 'FECHA']) .to_frame(index=False) ) future_rows[year_col] = future_rows['FECHA'].dt.year future_rows[month_col] = future_rows['FECHA'].dt.month # Añade columnas del histórico con NaN respetando el orden for c in df.columns: if c not in future_rows.columns: future_rows[c] = np.nan future_rows = future_rows[df.columns] # --------------------------------------------------------- # 4) Histórico + futuro y creación del lag12 por DISTRITO # --------------------------------------------------------- full = pd.concat([df, future_rows], ignore_index=True).sort_values([district_col, 'FECHA']) for col in cols_para_lag: full[f'{col}_lag12'] = full.groupby(district_col)[col].shift(12) # --------------------------------------------------------- # 5) Relleno en tramo de predicción con lag12 y columnas estáticas # --------------------------------------------------------- start_dt, end_dt = pd.to_datetime(start), pd.to_datetime(end) mask_future = (full['FECHA'] >= start_dt) & (full['FECHA'] <= end_dt) # columnas estáticas por distrito (ajústalas si tienes más) cols_estaticas = [c for c in ['LATITUD','LONGITUD','SHAPE_AREA'] if c in full.columns] for c in cols_estaticas: full[c] = full.groupby(district_col)[c].ffill().bfill() # Copiar el valor del lag12 si en el futuro está NaN (solo para las columnas a laggear) for c in cols_para_lag: lag12 = f'{c}_lag12' full.loc[mask_future, c] = full.loc[mask_future, c].where( full.loc[mask_future, c].notna(), full.loc[mask_future, lag12] ) # El TARGET debe quedar en NaN en el tramo futuro (lo predecirá el modelo) if target_col in full.columns: full.loc[mask_future, target_col] = np.nan # --------------------------------------------------------- # 6) One-Hot de distritos (prefijo DIST_) y garantía de columnas # --------------------------------------------------------- full = pd.get_dummies(full, columns=[district_col], prefix='DIST', dtype=bool) for d in distritos: colname = f'DIST_{d}' if colname not in full.columns: full[colname] = False # --------------------------------------------------------- # 7) Extrae el DF de PREDICCIÓN y ordena columnas # - Primero TODAS las columnas originales (orden original) # - Después TODAS las columnas lag12 en el mismo orden de sus originales # - Finalmente las columnas OHE (DIST_*) # --------------------------------------------------------- dataset_prediccion = full.loc[mask_future].copy() # columnas originales (del histórico) que están en dataset_prediccion orig_cols = [c for c in df.columns if c in dataset_prediccion.columns] # columnas lag en el mismo orden (solo las existentes) lag_cols = [f'{c}_lag12' for c in orig_cols if f'{c}_lag12' in dataset_prediccion.columns] # columnas OHE dist_cols = sorted([c for c in dataset_prediccion.columns if c.startswith('DIST_')]) # orden final final_cols = orig_cols + lag_cols + dist_cols dataset_prediccion = dataset_prediccion[final_cols] print(dataset_prediccion.shape) dataset_prediccion.head(10)
## DEFINITIVO
# ----------------------------
# PARÁMETROS
# ----------------------------
start = '2025-07' # horizonte de predicción: inicio
end = '2026-06' # horizonte de predicción: fin
target_col = 'PRECIO M2'
district_col = 'DISTRITO'
year_col = 'AÑO'
month_col = 'MES_NUM'
# Solo lag de 12 meses
LAGS = [12]
# Lista de distritos (para OHE consistente)
distritos = [
'Arganzuela','Barajas','Carabanchel','Centro','Chamartin','Chamberi','Ciudad Lineal',
'Fuencarral-El Pardo','Hortaleza','Latina','Moncloa-Aravaca','Moratalaz','Puente De Vallecas',
'Retiro','Salamanca','San Blas-Canillejas','Tetuan','Usera','Vicalvaro','Villa De Vallecas',
'Villaverde'
]
# ---------------------------------------------------------
# 1) Copia histórico y asegura tipos/orden temporal
# ---------------------------------------------------------
df = dataset_entrenamiento.copy()
df[year_col] = df[year_col].astype(int)
df[month_col] = df[month_col].astype(int)
# FECHA al primer día de mes para tener una referencia clara mensual
df['FECHA'] = pd.to_datetime(df[year_col].astype(str) + '-' + df[month_col].astype(str) + '-01')
df = df.sort_values([district_col, 'FECHA']).reset_index(drop=True)
# ---------------------------------------------------------
# 2) Columnas para lag DESDE 'EURIBOR' EN ADELANTE (numéricas)
# ---------------------------------------------------------
if 'EURIBOR' not in df.columns:
raise KeyError("No encuentro la columna 'EURIBOR' en el DataFrame.")
idx_ini = df.columns.get_loc('EURIBOR')
candidatas = df.columns[idx_ini:] # desde EURIBOR hacia la derecha
num_cols = df.select_dtypes(include=[np.number]).columns
cols_para_lag = [c for c in candidatas if c in num_cols] # mismas columnas y orden
# ---------------------------------------------------------
# 3) Panel mensual completo por distrito (histórico mínimo -> end)
# Evita desalineaciones del shift(12) si faltan meses en el histórico
# ---------------------------------------------------------
hist_min = df['FECHA'].min()
end_dt = pd.to_datetime(end)
# Periodos mensuales y a Timestamp en INICIO de mes
cal = pd.period_range(start=hist_min.to_period('M'),
end=end_dt.to_period('M'),
freq='M').to_timestamp(how='start')
panel = (
pd.MultiIndex.from_product([distritos, cal], names=[district_col, 'FECHA'])
.to_frame(index=False)
)
panel[year_col] = panel['FECHA'].dt.year
panel[month_col] = panel['FECHA'].dt.month
# Merge del histórico sobre el panel (rellena huecos con NaN)
full = panel.merge(
df,
on=[district_col, 'FECHA', year_col, month_col],
how='left',
suffixes=('', '_dup')
)
# ---------------------------------------------------------
# 4) Crear lag12 por DISTRITO (alineado por mes)
# ---------------------------------------------------------
full = full.sort_values([district_col, 'FECHA']).reset_index(drop=True)
for col in cols_para_lag:
full[f'{col}_lag12'] = full.groupby(district_col, group_keys=False)[col].shift(12)
# ---------------------------------------------------------
# 5) Relleno en tramo de predicción con lag12 y columnas estáticas
# ---------------------------------------------------------
start_dt = pd.to_datetime(start)
mask_future = (full['FECHA'] >= start_dt) & (full['FECHA'] <= end_dt)
# columnas estáticas por distrito (ajústalas si tienes más)
cols_estaticas = [c for c in ['LATITUD','LONGITUD','SHAPE_AREA'] if c in full.columns]
for c in cols_estaticas:
full[c] = full.groupby(district_col, group_keys=False)[c].ffill().bfill()
# Copiar SIEMPRE el valor del lag12 en el futuro (desde EURIBOR en adelante)
for c in cols_para_lag:
lag12 = f'{c}_lag12'
full.loc[mask_future, c] = full.loc[mask_future, lag12]
# El TARGET debe quedar en NaN en el tramo futuro (lo predecirá el modelo)
if target_col in full.columns:
full.loc[mask_future, target_col] = np.nan
# ---------------------------------------------------------
# 6) One-Hot de distritos (prefijo DIST_) y garantía de columnas
# ---------------------------------------------------------
full = pd.get_dummies(full, columns=[district_col], prefix='DIST', dtype=bool)
for d in distritos:
colname = f'DIST_{d}'
if colname not in full.columns:
full[colname] = False
# ---------------------------------------------------------
# 7) Extrae el DF de PREDICCIÓN y ordena columnas
# - Primero TODAS las columnas originales (orden original)
# - Después TODAS las columnas lag12 en el mismo orden de sus originales
# - Finalmente las columnas OHE (DIST_*)
# ---------------------------------------------------------
dataset_prediccion = full.loc[mask_future].copy()
# columnas originales (del histórico) que están en dataset_prediccion
orig_cols = [c for c in df.columns if c in dataset_prediccion.columns]
# columnas lag en el mismo orden (solo las existentes)
lag_cols = [f'{c}_lag12' for c in orig_cols if f'{c}_lag12' in dataset_prediccion.columns]
# columnas OHE
dist_cols = sorted([c for c in dataset_prediccion.columns if c.startswith('DIST_')])
# orden final
final_cols = orig_cols + lag_cols + dist_cols
dataset_prediccion = dataset_prediccion[final_cols]
print(dataset_prediccion.shape)
dataset_prediccion
(252, 51)
| AÑO | MES_NUM | LATITUD | LONGITUD | SHAPE_AREA | EURIBOR | VARIACIÓN ANUAL | PRECIO M2 | TRANSACCIONES_CM | INDICE_PRECIO | ... | DIST_Moratalaz | DIST_Puente De Vallecas | DIST_Retiro | DIST_Salamanca | DIST_San Blas-Canillejas | DIST_Tetuan | DIST_Usera | DIST_Vicalvaro | DIST_Villa De Vallecas | DIST_Villaverde | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 218 | 2025 | 7 | 40.40021 | -3.69618 | 6.462176e+06 | 0.03526 | 0.114 | <NA> | 6657.0 | 174.54 | ... | False | False | False | False | False | False | False | False | False | False |
| 219 | 2025 | 8 | 40.40021 | -3.69618 | 6.462176e+06 | 0.03169 | 0.114 | <NA> | 5423.0 | 174.54 | ... | False | False | False | False | False | False | False | False | False | False |
| 220 | 2025 | 9 | 40.40021 | -3.69618 | 6.462176e+06 | 0.02940 | 0.114 | <NA> | 7380.0 | 174.54 | ... | False | False | False | False | False | False | False | False | False | False |
| 221 | 2025 | 10 | 40.40021 | -3.69618 | 6.462176e+06 | 0.02686 | 0.126 | <NA> | 7380.0 | 174.54 | ... | False | False | False | False | False | False | False | False | False | False |
| 222 | 2025 | 11 | 40.40021 | -3.69618 | 6.462176e+06 | 0.02506 | 0.125 | <NA> | 6876.0 | 174.54 | ... | False | False | False | False | False | False | False | False | False | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 4825 | 2026 | 2 | 40.37400 | -3.70000 | 2.019432e+07 | 0.02402 | 0.140 | <NA> | 6963.0 | 174.54 | ... | False | False | False | False | False | False | False | False | False | True |
| 4826 | 2026 | 3 | 40.37400 | -3.70000 | 2.019432e+07 | 0.02398 | 0.154 | <NA> | 7380.0 | 174.54 | ... | False | False | False | False | False | False | False | False | False | True |
| 4827 | 2026 | 4 | 40.37400 | -3.70000 | 2.019432e+07 | 0.02143 | 0.155 | <NA> | 6604.0 | 134.63 | ... | False | False | False | False | False | False | False | False | False | True |
| 4828 | 2026 | 5 | 40.37400 | -3.70000 | 2.019432e+07 | NaN | NaN | <NA> | NaN | NaN | ... | False | False | False | False | False | False | False | False | False | True |
| 4829 | 2026 | 6 | 40.37400 | -3.70000 | 2.019432e+07 | NaN | NaN | <NA> | NaN | NaN | ... | False | False | False | False | False | False | False | False | False | True |
252 rows × 51 columns
# ---------------------------- # PARÁMETROS # ---------------------------- start = '2025-07' # horizonte de predicción: inicio end = '2026-06' # horizonte de predicción: fin target_col = 'PRECIO M2' district_col = 'DISTRITO' year_col = 'AÑO' month_col = 'MES_NUM' # Solo lag de 12 meses LAGS = [12] # --------------------------------------------------------- # 1) Copia histórico y asegura tipos/orden temporal # --------------------------------------------------------- df = dataset_entrenamiento.copy() df[year_col] = df[year_col].astype(int) df[month_col] = df[month_col].astype(int) df['FECHA'] = pd.to_datetime( df[year_col].astype(str) + '-' + df[month_col].astype(str) + '-01' ) df = df.sort_values([district_col, 'FECHA']).reset_index(drop=True) # --------------------------------------------------------- # 2) Columnas para lag DESDE 'EURIBOR' EN ADELANTE (numéricas) # --------------------------------------------------------- idx_ini = df.columns.get_loc('EURIBOR') candidatas = df.columns[idx_ini:] # desde esa columna hacia la derecha num_cols = df.select_dtypes(include=[np.number]).columns cols_para_lag = [c for c in candidatas if c in num_cols] # mismas columnas y orden # --------------------------------------------------------- # 3) Crea filas FUTURAS por distrito (solo fechas) # --------------------------------------------------------- future_months = pd.period_range(start=start, end=end, freq='M').to_timestamp() future_rows = ( pd.MultiIndex.from_product([df[district_col].unique(), future_months], names=[district_col, 'FECHA']) .to_frame(index=False) ) future_rows[year_col] = future_rows['FECHA'].dt.year future_rows[month_col] = future_rows['FECHA'].dt.month # Añade columnas del histórico con NaN respetando el orden for c in df.columns: if c not in future_rows.columns: future_rows[c] = np.nan future_rows = future_rows[df.columns] # --------------------------------------------------------- # 4) Histórico + futuro y creación del lag12 por DISTRITO # --------------------------------------------------------- full = pd.concat([df, future_rows], ignore_index=True).sort_values([district_col, 'FECHA']) for col in cols_para_lag: full[f'{col}_lag12'] = full.groupby(district_col)[col].shift(12) # --------------------------------------------------------- # 5) Relleno en tramo de predicción con lag12 y columnas estáticas # --------------------------------------------------------- start_dt, end_dt = pd.to_datetime(start), pd.to_datetime(end) mask_future = (full['FECHA'] >= start_dt) & (full['FECHA'] <= end_dt) # columnas estáticas por distrito cols_estaticas = [c for c in ['LATITUD','LONGITUD','SHAPE_AREA'] if c in full.columns] for c in cols_estaticas: full[c] = full.groupby(district_col)[c].ffill().bfill() # Copiar el valor del lag12 si en el futuro está NaN (solo para las columnas a laggear) for c in cols_para_lag: lag12 = f'{c}_lag12' full.loc[mask_future, c] = full.loc[mask_future, c].where( full.loc[mask_future, c].notna(), full.loc[mask_future, lag12] ) # El TARGET debe quedar en NaN en el tramo futuro (lo predecirá el modelo) if target_col in full.columns: full.loc[mask_future, target_col] = np.nan # --------------------------------------------------------- # 6) Extrae el DF de PREDICCIÓN y ordena columnas # - Primero todas las columnas originales # - Después todas las columnas lag12 en el mismo orden # --------------------------------------------------------- dataset_prediccion = full.loc[mask_future].copy() # columnas originales (del histórico) orig_cols = [c for c in df.columns if c in dataset_prediccion.columns] # columnas lag en el mismo orden lag_cols = [f'{c}_lag12' for c in orig_cols if f'{c}_lag12' in dataset_prediccion.columns] # orden final final_cols = orig_cols + lag_cols dataset_prediccion = dataset_prediccion[final_cols] print(dataset_prediccion.shape) dataset_prediccion.head(10)
# Eliminar columnas que no son de lag
# Lista de columnas originales que quieres eliminar
cols_drop = [
'EURIBOR',
'VARIACIÓN ANUAL',
'TRANSACCIONES_CM',
'INDICE_PRECIO',
'TOTAL_HIPOTECAS',
'POBLACION_ACTIVA',
'POBLACION',
'ESPERANZA_VIDA',
'VIVIENDAS_COMPRAVENTA',
'TRANSACCIONES_SUELO',
'PRECIO_MEDIO_M2_CCMM',
'FECHA'
]
# Eliminarlas si existen en df_lag
dataset_prediccion = dataset_prediccion.drop(columns=[c for c in cols_drop if c in dataset_prediccion.columns])
dataset_prediccion.head()
| AÑO | MES_NUM | LATITUD | LONGITUD | SHAPE_AREA | PRECIO M2 | EURIBOR_lag12 | VARIACIÓN ANUAL_lag12 | PRECIO M2_lag12 | TRANSACCIONES_CM_lag12 | ... | DIST_Moratalaz | DIST_Puente De Vallecas | DIST_Retiro | DIST_Salamanca | DIST_San Blas-Canillejas | DIST_Tetuan | DIST_Usera | DIST_Vicalvaro | DIST_Villa De Vallecas | DIST_Villaverde | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 218 | 2025 | 7 | 40.40021 | -3.69618 | 6.462176e+06 | <NA> | 0.03526 | 0.114 | 4957.0 | 6657.0 | ... | False | False | False | False | False | False | False | False | False | False |
| 219 | 2025 | 8 | 40.40021 | -3.69618 | 6.462176e+06 | <NA> | 0.03169 | 0.114 | 4998.0 | 5423.0 | ... | False | False | False | False | False | False | False | False | False | False |
| 220 | 2025 | 9 | 40.40021 | -3.69618 | 6.462176e+06 | <NA> | 0.02940 | 0.114 | 4999.0 | 7380.0 | ... | False | False | False | False | False | False | False | False | False | False |
| 221 | 2025 | 10 | 40.40021 | -3.69618 | 6.462176e+06 | <NA> | 0.02686 | 0.126 | 5079.0 | 7380.0 | ... | False | False | False | False | False | False | False | False | False | False |
| 222 | 2025 | 11 | 40.40021 | -3.69618 | 6.462176e+06 | <NA> | 0.02506 | 0.125 | 5062.0 | 6876.0 | ... | False | False | False | False | False | False | False | False | False | False |
5 rows × 39 columns
dataset_prediccion = dataset_prediccion.copy()
# columnas que NO quieres convertir (fechas y categóricas puras)
EXCLUDE = ["FECHA", "AÑO_MES", "DISTRITO"]
# 1) Intentar pasar a número todo lo que no sea fecha ni excluido
cols_try = [c for c in dataset_prediccion.columns
if c not in EXCLUDE and not is_datetime64_any_dtype(dataset_prediccion[c])]
dataset_prediccion[cols_try] = dataset_prediccion[cols_try].apply(pd.to_numeric, errors="coerce")
# 2) Asegurar float en todas las numéricas
num_cols = dataset_prediccion.select_dtypes(include="number").columns
dataset_prediccion[num_cols] = dataset_prediccion[num_cols].astype("Float64")
dataset_prediccion.info()
<class 'pandas.core.frame.DataFrame'> Index: 252 entries, 218 to 4829 Data columns (total 39 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 AÑO 252 non-null Float64 1 MES_NUM 252 non-null Float64 2 LATITUD 252 non-null Float64 3 LONGITUD 252 non-null Float64 4 SHAPE_AREA 252 non-null Float64 5 PRECIO M2 0 non-null Float64 6 EURIBOR_lag12 250 non-null Float64 7 VARIACIÓN ANUAL_lag12 250 non-null Float64 8 PRECIO M2_lag12 250 non-null Float64 9 TRANSACCIONES_CM_lag12 250 non-null Float64 10 INDICE_PRECIO_lag12 250 non-null Float64 11 TOTAL_HIPOTECAS_lag12 250 non-null Float64 12 POBLACION_ACTIVA_lag12 250 non-null Float64 13 POBLACION_lag12 250 non-null Float64 14 ESPERANZA_VIDA_lag12 250 non-null Float64 15 VIVIENDAS_COMPRAVENTA_lag12 250 non-null Float64 16 TRANSACCIONES_SUELO_lag12 250 non-null Float64 17 PRECIO_MEDIO_M2_CCMM_lag12 250 non-null Float64 18 DIST_Arganzuela 252 non-null bool 19 DIST_Barajas 252 non-null bool 20 DIST_Carabanchel 252 non-null bool 21 DIST_Centro 252 non-null bool 22 DIST_Chamartin 252 non-null bool 23 DIST_Chamberi 252 non-null bool 24 DIST_Ciudad Lineal 252 non-null bool 25 DIST_Fuencarral-El Pardo 252 non-null bool 26 DIST_Hortaleza 252 non-null bool 27 DIST_Latina 252 non-null bool 28 DIST_Moncloa-Aravaca 252 non-null bool 29 DIST_Moratalaz 252 non-null bool 30 DIST_Puente De Vallecas 252 non-null bool 31 DIST_Retiro 252 non-null bool 32 DIST_Salamanca 252 non-null bool 33 DIST_San Blas-Canillejas 252 non-null bool 34 DIST_Tetuan 252 non-null bool 35 DIST_Usera 252 non-null bool 36 DIST_Vicalvaro 252 non-null bool 37 DIST_Villa De Vallecas 252 non-null bool 38 DIST_Villaverde 252 non-null bool dtypes: Float64(18), bool(21) memory usage: 47.0 KB
# Rellenar los NaN con el valor del mes anterior dentro del mismo distrito y año.
# === Reconstruir distrito desde OHE y rellenar dentro del MISMO AÑO ===
dist_cols = [c for c in dataset_prediccion.columns if c.startswith('DIST_')]
# Recuperar etiqueta de distrito (columna auxiliar)
dataset_prediccion['__DISTRITO__'] = (
dataset_prediccion[dist_cols].idxmax(axis=1).str.replace('DIST_', '', regex=False)
)
# columnas numéricas a rellenar (evita OHE y el target)
cols_fill = [
c for c in dataset_prediccion.columns
if pd.api.types.is_numeric_dtype(dataset_prediccion[c])
and not c.startswith('DIST_')
and c != target_col
]
# ordenar y forward-fill por distrito y AÑO (enero no hereda diciembre)
dataset_prediccion = dataset_prediccion.sort_values(['__DISTRITO__', year_col, month_col]).reset_index(drop=True)
dataset_prediccion[cols_fill] = (
dataset_prediccion.groupby(['__DISTRITO__', year_col], group_keys=False)[cols_fill]
.ffill()
)
# opcional: si quieres también bfill dentro del año
# dataset_prediccion[cols_fill] = dataset_prediccion.groupby(['__DISTRITO__', year_col], group_keys=False)[cols_fill].ffill().bfill()
# limpiar auxiliar
dataset_prediccion = dataset_prediccion.drop(columns='__DISTRITO__')
dataset_prediccion.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 252 entries, 0 to 251 Data columns (total 39 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 AÑO 252 non-null Float64 1 MES_NUM 252 non-null Float64 2 LATITUD 252 non-null Float64 3 LONGITUD 252 non-null Float64 4 SHAPE_AREA 252 non-null Float64 5 PRECIO M2 0 non-null Float64 6 EURIBOR_lag12 252 non-null Float64 7 VARIACIÓN ANUAL_lag12 252 non-null Float64 8 PRECIO M2_lag12 252 non-null Float64 9 TRANSACCIONES_CM_lag12 252 non-null Float64 10 INDICE_PRECIO_lag12 252 non-null Float64 11 TOTAL_HIPOTECAS_lag12 252 non-null Float64 12 POBLACION_ACTIVA_lag12 252 non-null Float64 13 POBLACION_lag12 252 non-null Float64 14 ESPERANZA_VIDA_lag12 252 non-null Float64 15 VIVIENDAS_COMPRAVENTA_lag12 252 non-null Float64 16 TRANSACCIONES_SUELO_lag12 252 non-null Float64 17 PRECIO_MEDIO_M2_CCMM_lag12 252 non-null Float64 18 DIST_Arganzuela 252 non-null bool 19 DIST_Barajas 252 non-null bool 20 DIST_Carabanchel 252 non-null bool 21 DIST_Centro 252 non-null bool 22 DIST_Chamartin 252 non-null bool 23 DIST_Chamberi 252 non-null bool 24 DIST_Ciudad Lineal 252 non-null bool 25 DIST_Fuencarral-El Pardo 252 non-null bool 26 DIST_Hortaleza 252 non-null bool 27 DIST_Latina 252 non-null bool 28 DIST_Moncloa-Aravaca 252 non-null bool 29 DIST_Moratalaz 252 non-null bool 30 DIST_Puente De Vallecas 252 non-null bool 31 DIST_Retiro 252 non-null bool 32 DIST_Salamanca 252 non-null bool 33 DIST_San Blas-Canillejas 252 non-null bool 34 DIST_Tetuan 252 non-null bool 35 DIST_Usera 252 non-null bool 36 DIST_Vicalvaro 252 non-null bool 37 DIST_Villa De Vallecas 252 non-null bool 38 DIST_Villaverde 252 non-null bool dtypes: Float64(18), bool(21) memory usage: 45.2 KB
dataset_prediccion.to_excel(r"C:\Users\evahr\Downloads\TFM-idealista\dataset_prediccion.xlsx", index=False)
dataset_entrenamiento.head()
| AÑO | MES_NUM | DISTRITO | LATITUD | LONGITUD | SHAPE_AREA | EURIBOR | VARIACIÓN ANUAL | PRECIO M2 | TRANSACCIONES_CM | INDICE_PRECIO | TOTAL_HIPOTECAS | POBLACION_ACTIVA | POBLACION | ESPERANZA_VIDA | VIVIENDAS_COMPRAVENTA | TRANSACCIONES_SUELO | PRECIO_MEDIO_M2_CCMM | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2025.0 | 6.0 | Arganzuela | 40.40021 | -3.69618 | 6.462176e+06 | 0.02081 | 0.165 | 5729.0 | 6944.0 | 134.735 | 6964.0 | 3784.3 | 154118.0 | 84.95 | 1494.0 | 363.0 | 5.74518 |
| 1 | 2025.0 | 5.0 | Arganzuela | 40.40021 | -3.69618 | 6.462176e+06 | 0.02080 | 0.165 | 5669.0 | 7020.0 | 134.735 | 7598.0 | 3784.3 | 154118.0 | 84.95 | 1559.0 | 363.0 | 5.74518 |
| 2 | 2025.0 | 4.0 | Arganzuela | 40.40021 | -3.69618 | 6.462176e+06 | 0.02143 | 0.165 | 5472.0 | 6604.0 | 134.735 | 8425.0 | 3784.3 | 154118.0 | 84.95 | 1499.0 | 363.0 | 5.74518 |
| 3 | 2025.0 | 3.0 | Arganzuela | 40.40021 | -3.69618 | 6.462176e+06 | 0.02398 | 0.163 | 5341.0 | 7380.0 | 174.540 | 7973.0 | 3784.3 | 154118.0 | 84.95 | 1843.0 | 436.0 | 5.72802 |
| 4 | 2025.0 | 2.0 | Arganzuela | 40.40021 | -3.69618 | 6.462176e+06 | 0.02402 | 0.162 | 5310.0 | 6963.0 | 174.540 | 6701.0 | 3784.3 | 154118.0 | 84.95 | 1786.0 | 436.0 | 5.72802 |
# Crear lag (-12) para variables a partir de EURIBOR en adelante
dataset_entrenamiento_lag = dataset_entrenamiento.copy()
# ordenar por distrito y tiempo
dataset_entrenamiento_lag['AÑO'] = dataset_entrenamiento_lag['AÑO'].astype(int)
dataset_entrenamiento_lag['MES_NUM'] = dataset_entrenamiento_lag['MES_NUM'].astype(int)
dataset_entrenamiento_lag['FECHA'] = pd.to_datetime(dataset_entrenamiento_lag['AÑO'].astype(str) + '-' + dataset_entrenamiento_lag['MES_NUM'].astype(str) + '-01')
dataset_entrenamiento_lag = dataset_entrenamiento_lag.sort_values(['DISTRITO', 'FECHA'])
# columnas desde 'EURIBOR' hasta el final del DataFrame
start_idx = dataset_entrenamiento_lag.columns.get_loc('EURIBOR')
cols_desde_euribor = dataset_entrenamiento_lag.columns[start_idx:]
# nos quedamos solo con las numéricas dentro de ese tramo para laggear
cols_lag = [c for c in cols_desde_euribor if pd.api.types.is_numeric_dtype(dataset_entrenamiento_lag[c])]
# crear lag 12 por DISTRITO
for c in cols_lag:
dataset_entrenamiento_lag[f'{c}_lag12'] = dataset_entrenamiento_lag.groupby('DISTRITO')[c].shift(12)
# eliminar columna temporal de fecha
dataset_entrenamiento_lag = dataset_entrenamiento_lag.drop(columns=['FECHA'])
# df_lag = df_lag.drop(columns='FECHA') # opcional
dataset_entrenamiento_lag.head()
| AÑO | MES_NUM | DISTRITO | LATITUD | LONGITUD | SHAPE_AREA | EURIBOR | VARIACIÓN ANUAL | PRECIO M2 | TRANSACCIONES_CM | ... | PRECIO M2_lag12 | TRANSACCIONES_CM_lag12 | INDICE_PRECIO_lag12 | TOTAL_HIPOTECAS_lag12 | POBLACION_ACTIVA_lag12 | POBLACION_lag12 | ESPERANZA_VIDA_lag12 | VIVIENDAS_COMPRAVENTA_lag12 | TRANSACCIONES_SUELO_lag12 | PRECIO_MEDIO_M2_CCMM_lag12 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 211 | 2007 | 11 | Arganzuela | 40.40021 | -3.69618 | 6.462176e+06 | 0.04498 | 0.0125 | 4000.0 | 6058.0 | ... | <NA> | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 210 | 2007 | 12 | Arganzuela | 40.40021 | -3.69618 | 6.462176e+06 | 0.04498 | 0.0125 | 4000.0 | 4577.0 | ... | <NA> | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 209 | 2008 | 1 | Arganzuela | 40.40021 | -3.69618 | 6.462176e+06 | 0.04498 | 0.0125 | 4000.0 | 6123.0 | ... | <NA> | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 208 | 2008 | 2 | Arganzuela | 40.40021 | -3.69618 | 6.462176e+06 | 0.04349 | 0.0125 | 3824.0 | 5964.0 | ... | <NA> | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 207 | 2008 | 3 | Arganzuela | 40.40021 | -3.69618 | 6.462176e+06 | 0.04498 | 0.0125 | 4000.0 | 5525.0 | ... | <NA> | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 30 columns
# Eliminar columnas que no son de lag
# Lista de columnas originales que quieres eliminar
cols_drop = [
'EURIBOR',
'VARIACIÓN ANUAL',
'TRANSACCIONES_CM',
'INDICE_PRECIO',
'TOTAL_HIPOTECAS',
'POBLACION_ACTIVA',
'POBLACION',
'ESPERANZA_VIDA',
'VIVIENDAS_COMPRAVENTA',
'TRANSACCIONES_SUELO',
'PRECIO_MEDIO_M2_CCMM',
'FECHA'
]
# Eliminarlas si existen en df_lag
dataset_entrenamiento_lag = dataset_entrenamiento_lag.drop(columns=[c for c in cols_drop if c in dataset_entrenamiento_lag.head().columns])
dataset_entrenamiento_lag.head()
| AÑO | MES_NUM | DISTRITO | LATITUD | LONGITUD | SHAPE_AREA | PRECIO M2 | EURIBOR_lag12 | VARIACIÓN ANUAL_lag12 | PRECIO M2_lag12 | TRANSACCIONES_CM_lag12 | INDICE_PRECIO_lag12 | TOTAL_HIPOTECAS_lag12 | POBLACION_ACTIVA_lag12 | POBLACION_lag12 | ESPERANZA_VIDA_lag12 | VIVIENDAS_COMPRAVENTA_lag12 | TRANSACCIONES_SUELO_lag12 | PRECIO_MEDIO_M2_CCMM_lag12 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 211 | 2007 | 11 | Arganzuela | 40.40021 | -3.69618 | 6.462176e+06 | 4000.0 | NaN | NaN | <NA> | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 210 | 2007 | 12 | Arganzuela | 40.40021 | -3.69618 | 6.462176e+06 | 4000.0 | NaN | NaN | <NA> | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 209 | 2008 | 1 | Arganzuela | 40.40021 | -3.69618 | 6.462176e+06 | 4000.0 | NaN | NaN | <NA> | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 208 | 2008 | 2 | Arganzuela | 40.40021 | -3.69618 | 6.462176e+06 | 3824.0 | NaN | NaN | <NA> | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 207 | 2008 | 3 | Arganzuela | 40.40021 | -3.69618 | 6.462176e+06 | 4000.0 | NaN | NaN | <NA> | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
print("Número total de columnas:", dataset_entrenamiento_lag.shape[1])
Número total de columnas: 19
# Análisis valores nulos por variable
def resumen_nulos(df: pd.DataFrame) -> pd.DataFrame:
total = len(df)
nulos = df.isna().sum()
pct = (nulos / total * 100).round(2)
resumen = (
pd.DataFrame({"n_nulos": nulos, "%_nulos": pct})
.sort_values("%_nulos", ascending=False)
.reset_index()
.rename(columns={"index": "columna"})
)
return resumen
# Uso:
res = resumen_nulos(dataset_entrenamiento_lag)
print(f"Filas totales: {len(dataset_entrenamiento_lag):,}".replace(",", "."))
res
Filas totales: 4.418
| columna | n_nulos | %_nulos | |
|---|---|---|---|
| 0 | VARIACIÓN ANUAL_lag12 | 252 | 5.7 |
| 1 | PRECIO M2_lag12 | 252 | 5.7 |
| 2 | TRANSACCIONES_CM_lag12 | 252 | 5.7 |
| 3 | ESPERANZA_VIDA_lag12 | 252 | 5.7 |
| 4 | VIVIENDAS_COMPRAVENTA_lag12 | 252 | 5.7 |
| 5 | TRANSACCIONES_SUELO_lag12 | 252 | 5.7 |
| 6 | EURIBOR_lag12 | 252 | 5.7 |
| 7 | POBLACION_lag12 | 252 | 5.7 |
| 8 | POBLACION_ACTIVA_lag12 | 252 | 5.7 |
| 9 | TOTAL_HIPOTECAS_lag12 | 252 | 5.7 |
| 10 | INDICE_PRECIO_lag12 | 252 | 5.7 |
| 11 | PRECIO_MEDIO_M2_CCMM_lag12 | 252 | 5.7 |
| 12 | DISTRITO | 0 | 0.0 |
| 13 | AÑO | 0 | 0.0 |
| 14 | MES_NUM | 0 | 0.0 |
| 15 | LATITUD | 0 | 0.0 |
| 16 | LONGITUD | 0 | 0.0 |
| 17 | SHAPE_AREA | 0 | 0.0 |
| 18 | PRECIO M2 | 0 | 0.0 |
# Eliminar filas con nulos en cualquier columna
dataset_entrenamiento_lag = dataset_entrenamiento_lag.dropna(axis=0)
# Reiniciar el índice después de eliminar filas
#dataset_entrenamiento_lag = dataset_entrenamiento_lag.reset_index(drop=True)
# Análisis valores nulos por variable
def resumen_nulos(df: pd.DataFrame) -> pd.DataFrame:
total = len(df)
nulos = df.isna().sum()
pct = (nulos / total * 100).round(2)
resumen = (
pd.DataFrame({"n_nulos": nulos, "%_nulos": pct})
.sort_values("%_nulos", ascending=False)
.reset_index()
.rename(columns={"index": "columna"})
)
return resumen
# Uso:
res = resumen_nulos(dataset_entrenamiento_lag)
print(f"Filas totales: {len(dataset_entrenamiento_lag):,}".replace(",", "."))
res
Filas totales: 4.166
| columna | n_nulos | %_nulos | |
|---|---|---|---|
| 0 | AÑO | 0 | 0.0 |
| 1 | MES_NUM | 0 | 0.0 |
| 2 | DISTRITO | 0 | 0.0 |
| 3 | LATITUD | 0 | 0.0 |
| 4 | LONGITUD | 0 | 0.0 |
| 5 | SHAPE_AREA | 0 | 0.0 |
| 6 | PRECIO M2 | 0 | 0.0 |
| 7 | EURIBOR_lag12 | 0 | 0.0 |
| 8 | VARIACIÓN ANUAL_lag12 | 0 | 0.0 |
| 9 | PRECIO M2_lag12 | 0 | 0.0 |
| 10 | TRANSACCIONES_CM_lag12 | 0 | 0.0 |
| 11 | INDICE_PRECIO_lag12 | 0 | 0.0 |
| 12 | TOTAL_HIPOTECAS_lag12 | 0 | 0.0 |
| 13 | POBLACION_ACTIVA_lag12 | 0 | 0.0 |
| 14 | POBLACION_lag12 | 0 | 0.0 |
| 15 | ESPERANZA_VIDA_lag12 | 0 | 0.0 |
| 16 | VIVIENDAS_COMPRAVENTA_lag12 | 0 | 0.0 |
| 17 | TRANSACCIONES_SUELO_lag12 | 0 | 0.0 |
| 18 | PRECIO_MEDIO_M2_CCMM_lag12 | 0 | 0.0 |
Se eliminaron 252 registros con valores nulos (5,7% del total), dado que su proporción era baja y no representaban un impacto significativo en la muestra. Esta decisión evita introducir sesgo mediante imputaciones artificiales. Estos nulos se deben principalmente a los primeros años (2007-2009)
# Número de filas y columnas
n_filas, n_columnas = dataset_entrenamiento_lag.shape
print(f"Filas: {n_filas}")
print(f"Columnas: {n_columnas}")
Filas: 4166 Columnas: 19
dataset_entrenamiento_lag = dataset_entrenamiento_lag.copy()
# columnas que NO quieres convertir (fechas y categóricas puras)
EXCLUDE = ["DISTRITO"]
# 1) Intentar pasar a número todo lo que no sea fecha ni excluido
cols_try = [c for c in dataset_entrenamiento_lag.columns
if c not in EXCLUDE and not is_datetime64_any_dtype(dataset_entrenamiento_lag[c])]
dataset_entrenamiento_lag[cols_try] = dataset_entrenamiento_lag[cols_try].apply(pd.to_numeric, errors="coerce")
# 2) Asegurar float en todas las numéricas
num_cols = dataset_entrenamiento_lag.select_dtypes(include="number").columns
dataset_entrenamiento_lag[num_cols] = dataset_entrenamiento_lag[num_cols].astype("Float64")
dataset_entrenamiento_lag.info()
<class 'pandas.core.frame.DataFrame'> Index: 4166 entries, 199 to 4210 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 AÑO 4166 non-null Float64 1 MES_NUM 4166 non-null Float64 2 DISTRITO 4166 non-null string 3 LATITUD 4166 non-null Float64 4 LONGITUD 4166 non-null Float64 5 SHAPE_AREA 4166 non-null Float64 6 PRECIO M2 4166 non-null Float64 7 EURIBOR_lag12 4166 non-null Float64 8 VARIACIÓN ANUAL_lag12 4166 non-null Float64 9 PRECIO M2_lag12 4166 non-null Float64 10 TRANSACCIONES_CM_lag12 4166 non-null Float64 11 INDICE_PRECIO_lag12 4166 non-null Float64 12 TOTAL_HIPOTECAS_lag12 4166 non-null Float64 13 POBLACION_ACTIVA_lag12 4166 non-null Float64 14 POBLACION_lag12 4166 non-null Float64 15 ESPERANZA_VIDA_lag12 4166 non-null Float64 16 VIVIENDAS_COMPRAVENTA_lag12 4166 non-null Float64 17 TRANSACCIONES_SUELO_lag12 4166 non-null Float64 18 PRECIO_MEDIO_M2_CCMM_lag12 4166 non-null Float64 dtypes: Float64(18), string(1) memory usage: 724.2 KB
# One Hot Encoding con pandas
dataset_entrenamiento_lag1 = pd.get_dummies(dataset_entrenamiento_lag,
columns=["DISTRITO"],
prefix="DIST",
drop_first=False)
# Ver las nuevas columnas
print(dataset_entrenamiento_lag1.head())
AÑO MES_NUM LATITUD LONGITUD SHAPE_AREA PRECIO M2 \
199 2008.0 11.0 40.40021 -3.69618 6462175.91405 3714.0
198 2008.0 12.0 40.40021 -3.69618 6462175.91405 3667.0
197 2009.0 1.0 40.40021 -3.69618 6462175.91405 3782.0
196 2009.0 2.0 40.40021 -3.69618 6462175.91405 3722.0
195 2009.0 3.0 40.40021 -3.69618 6462175.91405 3722.0
EURIBOR_lag12 VARIACIÓN ANUAL_lag12 PRECIO M2_lag12 \
199 0.04498 0.0125 4000.0
198 0.04498 0.0125 4000.0
197 0.04498 0.0125 4000.0
196 0.04349 0.0125 3824.0
195 0.04498 0.0125 4000.0
TRANSACCIONES_CM_lag12 ... DIST_Moratalaz DIST_Puente De Vallecas \
199 6058.0 ... False False
198 4577.0 ... False False
197 6123.0 ... False False
196 5964.0 ... False False
195 5525.0 ... False False
DIST_Retiro DIST_Salamanca DIST_San Blas-Canillejas DIST_Tetuan \
199 False False False False
198 False False False False
197 False False False False
196 False False False False
195 False False False False
DIST_Usera DIST_Vicalvaro DIST_Villa De Vallecas DIST_Villaverde
199 False False False False
198 False False False False
197 False False False False
196 False False False False
195 False False False False
[5 rows x 39 columns]
dataset_entrenamiento_lag1.info()
<class 'pandas.core.frame.DataFrame'> Index: 4166 entries, 199 to 4210 Data columns (total 39 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 AÑO 4166 non-null Float64 1 MES_NUM 4166 non-null Float64 2 LATITUD 4166 non-null Float64 3 LONGITUD 4166 non-null Float64 4 SHAPE_AREA 4166 non-null Float64 5 PRECIO M2 4166 non-null Float64 6 EURIBOR_lag12 4166 non-null Float64 7 VARIACIÓN ANUAL_lag12 4166 non-null Float64 8 PRECIO M2_lag12 4166 non-null Float64 9 TRANSACCIONES_CM_lag12 4166 non-null Float64 10 INDICE_PRECIO_lag12 4166 non-null Float64 11 TOTAL_HIPOTECAS_lag12 4166 non-null Float64 12 POBLACION_ACTIVA_lag12 4166 non-null Float64 13 POBLACION_lag12 4166 non-null Float64 14 ESPERANZA_VIDA_lag12 4166 non-null Float64 15 VIVIENDAS_COMPRAVENTA_lag12 4166 non-null Float64 16 TRANSACCIONES_SUELO_lag12 4166 non-null Float64 17 PRECIO_MEDIO_M2_CCMM_lag12 4166 non-null Float64 18 DIST_Arganzuela 4166 non-null boolean 19 DIST_Barajas 4166 non-null boolean 20 DIST_Carabanchel 4166 non-null boolean 21 DIST_Centro 4166 non-null boolean 22 DIST_Chamartin 4166 non-null boolean 23 DIST_Chamberi 4166 non-null boolean 24 DIST_Ciudad Lineal 4166 non-null boolean 25 DIST_Fuencarral-El Pardo 4166 non-null boolean 26 DIST_Hortaleza 4166 non-null boolean 27 DIST_Latina 4166 non-null boolean 28 DIST_Moncloa-Aravaca 4166 non-null boolean 29 DIST_Moratalaz 4166 non-null boolean 30 DIST_Puente De Vallecas 4166 non-null boolean 31 DIST_Retiro 4166 non-null boolean 32 DIST_Salamanca 4166 non-null boolean 33 DIST_San Blas-Canillejas 4166 non-null boolean 34 DIST_Tetuan 4166 non-null boolean 35 DIST_Usera 4166 non-null boolean 36 DIST_Vicalvaro 4166 non-null boolean 37 DIST_Villa De Vallecas 4166 non-null boolean 38 DIST_Villaverde 4166 non-null boolean dtypes: Float64(18), boolean(21) memory usage: 862.5 KB
## Sin One Hot Encoding
dataset_entrenamiento_lag.to_excel(r"C:\Users\evahr\Downloads\TFM-idealista\dataset_entrenamiento_lag.xlsx", index=False)
## Con One Hot Encoding
dataset_entrenamiento_lag1.to_excel(r"C:\Users\evahr\Downloads\TFM-idealista\dataset_entrenamiento_lag1.xlsx", index=False)
dataset_entrenamiento_lag1
| AÑO | MES_NUM | LATITUD | LONGITUD | SHAPE_AREA | PRECIO M2 | EURIBOR_lag12 | VARIACIÓN ANUAL_lag12 | PRECIO M2_lag12 | TRANSACCIONES_CM_lag12 | ... | DIST_Moratalaz | DIST_Puente De Vallecas | DIST_Retiro | DIST_Salamanca | DIST_San Blas-Canillejas | DIST_Tetuan | DIST_Usera | DIST_Vicalvaro | DIST_Villa De Vallecas | DIST_Villaverde | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 199 | 2008.0 | 11.0 | 40.40021 | -3.69618 | 6462175.91405 | 3714.0 | 0.04498 | 0.0125 | 4000.0 | 6058.0 | ... | False | False | False | False | False | False | False | False | False | False |
| 198 | 2008.0 | 12.0 | 40.40021 | -3.69618 | 6462175.91405 | 3667.0 | 0.04498 | 0.0125 | 4000.0 | 4577.0 | ... | False | False | False | False | False | False | False | False | False | False |
| 197 | 2009.0 | 1.0 | 40.40021 | -3.69618 | 6462175.91405 | 3782.0 | 0.04498 | 0.0125 | 4000.0 | 6123.0 | ... | False | False | False | False | False | False | False | False | False | False |
| 196 | 2009.0 | 2.0 | 40.40021 | -3.69618 | 6462175.91405 | 3722.0 | 0.04349 | 0.0125 | 3824.0 | 5964.0 | ... | False | False | False | False | False | False | False | False | False | False |
| 195 | 2009.0 | 3.0 | 40.40021 | -3.69618 | 6462175.91405 | 3722.0 | 0.04498 | 0.0125 | 4000.0 | 5525.0 | ... | False | False | False | False | False | False | False | False | False | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 4214 | 2024.0 | 12.0 | 40.374 | -3.7 | 20194322.8326 | 2252.0 | 0.03679 | 0.063 | 1963.0 | 4048.0 | ... | False | False | False | False | False | False | False | False | False | True |
| 4213 | 2025.0 | 1.0 | 40.374 | -3.7 | 20194322.8326 | 2207.0 | 0.03609 | 0.068 | 1982.0 | 6209.0 | ... | False | False | False | False | False | False | False | False | False | True |
| 4212 | 2025.0 | 2.0 | 40.374 | -3.7 | 20194322.8326 | 2254.0 | 0.03671 | 0.058 | 1977.0 | 6336.0 | ... | False | False | False | False | False | False | False | False | False | True |
| 4211 | 2025.0 | 3.0 | 40.374 | -3.7 | 20194322.8326 | 2294.0 | 0.03716 | 0.067 | 1988.0 | 5924.0 | ... | False | False | False | False | False | False | False | False | False | True |
| 4210 | 2025.0 | 4.0 | 40.374 | -3.7 | 20194322.8326 | 2362.0 | 0.03702 | 0.085 | 2046.0 | 5805.0 | ... | False | False | False | False | False | False | False | False | False | True |
4166 rows × 39 columns
dataset_entrenamiento_lag
| AÑO | MES_NUM | DISTRITO | LATITUD | LONGITUD | SHAPE_AREA | PRECIO M2 | EURIBOR_lag12 | VARIACIÓN ANUAL_lag12 | PRECIO M2_lag12 | TRANSACCIONES_CM_lag12 | INDICE_PRECIO_lag12 | TOTAL_HIPOTECAS_lag12 | POBLACION_ACTIVA_lag12 | POBLACION_lag12 | ESPERANZA_VIDA_lag12 | VIVIENDAS_COMPRAVENTA_lag12 | TRANSACCIONES_SUELO_lag12 | PRECIO_MEDIO_M2_CCMM_lag12 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 199 | 2008.0 | 11.0 | Arganzuela | 40.40021 | -3.69618 | 6462175.91405 | 3714.0 | 0.04498 | 0.0125 | 4000.0 | 6058.0 | 154.51 | 12744.0 | 3365.3 | 153868.0 | 83.4 | 2539.0 | 464.0 | 6.271159 |
| 198 | 2008.0 | 12.0 | Arganzuela | 40.40021 | -3.69618 | 6462175.91405 | 3667.0 | 0.04498 | 0.0125 | 4000.0 | 4577.0 | 154.51 | 9041.0 | 3365.3 | 153218.0 | 83.4 | 1819.0 | 464.0 | 6.271159 |
| 197 | 2009.0 | 1.0 | Arganzuela | 40.40021 | -3.69618 | 6462175.91405 | 3782.0 | 0.04498 | 0.0125 | 4000.0 | 6123.0 | 152.9 | 12744.0 | 3375.6 | 153670.0 | 83.3 | 2278.0 | 458.0 | 6.271159 |
| 196 | 2009.0 | 2.0 | Arganzuela | 40.40021 | -3.69618 | 6462175.91405 | 3722.0 | 0.04349 | 0.0125 | 3824.0 | 5964.0 | 152.9 | 12744.0 | 3375.6 | 153794.0 | 83.3 | 2347.0 | 458.0 | 6.271159 |
| 195 | 2009.0 | 3.0 | Arganzuela | 40.40021 | -3.69618 | 6462175.91405 | 3722.0 | 0.04498 | 0.0125 | 4000.0 | 5525.0 | 152.9 | 12024.0 | 3375.6 | 154370.0 | 83.3 | 2507.0 | 458.0 | 6.271159 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 4214 | 2024.0 | 12.0 | Villaverde | 40.374 | -3.7 | 20194322.8326 | 2252.0 | 0.03679 | 0.063 | 1963.0 | 4048.0 | 164.82 | 6548.0 | 3720.2 | 313118.0 | 85.9 | 917.0 | 345.0 | 5.677883 |
| 4213 | 2025.0 | 1.0 | Villaverde | 40.374 | -3.7 | 20194322.8326 | 2207.0 | 0.03609 | 0.068 | 1982.0 | 6209.0 | 168.79 | 8547.0 | 3750.8 | 313118.0 | 86.4 | 1339.0 | 294.0 | 5.750698 |
| 4212 | 2025.0 | 2.0 | Villaverde | 40.374 | -3.7 | 20194322.8326 | 2254.0 | 0.03671 | 0.058 | 1977.0 | 6336.0 | 168.79 | 8492.0 | 3750.8 | 313118.0 | 86.4 | 1541.0 | 294.0 | 5.750698 |
| 4211 | 2025.0 | 3.0 | Villaverde | 40.374 | -3.7 | 20194322.8326 | 2294.0 | 0.03716 | 0.067 | 1988.0 | 5924.0 | 168.79 | 7033.0 | 3750.8 | 313118.0 | 86.4 | 1213.0 | 294.0 | 5.750698 |
| 4210 | 2025.0 | 4.0 | Villaverde | 40.374 | -3.7 | 20194322.8326 | 2362.0 | 0.03702 | 0.085 | 2046.0 | 5805.0 | 174.54 | 7177.0 | 3749.9 | 313118.0 | 86.4 | 1077.0 | 297.0 | 5.599458 |
4166 rows × 19 columns